From 601b6624f0ac0b630b4cfac4bc29d03f62759266 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= Date: Sun, 12 Apr 2015 20:25:36 +0200 Subject: [PATCH 01/40] Ignore compiled script --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 96e4050..de16f90 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ /.coverage /docs/_build /data +/bin/propexc From 3716caf3f9efc79e53f5e130aa0700b04c8b7a88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= Date: Sun, 12 Apr 2015 16:13:55 +0200 Subject: [PATCH 02/40] More general parsing of genbank features --- propex/genbank.py | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/propex/genbank.py b/propex/genbank.py index a4f0ce8..167ed8c 100644 --- a/propex/genbank.py +++ b/propex/genbank.py @@ -283,9 +283,7 @@ def features_at_location(self, location): overlapping the location. """ features = [] - for feat in GenBank.features: - if feat not in self.features: - continue + for feat in self.features.iterkeys(): for feature in self.features[feat]: if feature.location.overlaps(location): features.append(feature) @@ -380,9 +378,6 @@ class GenBank(object): :raises: ValueError if parsing fails. """ - #: List of supported features. - features = ['CDS'] - def __init__(self, fname): """GenBank constructor. @@ -390,7 +385,7 @@ def __init__(self, fname): fname: filename of the GenBank file. """ self.filename = fname - self.index = self._index() + self.index, self.features = self._index() def _index(self): """Create and index of a the GenBank object. @@ -399,7 +394,9 @@ def _index(self): a list of dictionaries where each element in the list represents a locus. 
""" + features = set() indexdicts = [] + in_features = False with open(self.filename) as f: offset = 0 for lineno, line in enumerate(f): @@ -411,24 +408,30 @@ def _index(self): indexdicts.append({}) indexdicts[-1]['name'] = current_locus indexdicts[-1]['offset'] = offset - if line.strip().split()[0] == 'CDS': - if 'CDS' not in indexdicts[-1]: - indexdicts[-1]['CDS'] = [] - indexdicts[-1]['CDS'].append({ + if line.strip().split()[0] == 'ORIGIN': + indexdicts[-1]['ORIGIN'] = offset + len(line) + in_features = False + if in_features and line[5] != ' ': + feature = line.strip().split()[0] + features.add(feature) + if feature not in indexdicts[-1]: + indexdicts[-1][feature] = [] + indexdicts[-1][feature].append({ 'offset': offset, 'location': Location(line.strip().split()[1]) }) - if line.strip().split()[0] == 'ORIGIN': - indexdicts[-1]['ORIGIN'] = offset + len(line) + if line.startswith('FEATURES'): + in_features = True offset += len(line) - # Sort the CDS according to start position - for s in indexdicts: - if 'CDS' not in s: - continue - s['CDS'] = sorted(s['CDS'], key=lambda x: x['location'].start) + # Sort the features according to start position + for f in features: + for s in indexdicts: + if f not in s: + continue + s[f] = sorted(s[f], key=lambda x: x['location'].start) - return indexdicts + return indexdicts, features def __getitem__(self, index): """Get a specific GenBankLocus object. 
From 3d3a8181298480bd962493e8f6c7f22d41331ce4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= Date: Sun, 12 Apr 2015 20:02:37 +0200 Subject: [PATCH 03/40] Add custom exceptions --- bin/propex | 2 +- propex/genbank.py | 14 ++++++++++---- propex/tests/test_genbank.py | 4 ++-- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/bin/propex b/bin/propex index 31d5024..7fe112f 100644 --- a/bin/propex +++ b/bin/propex @@ -32,7 +32,7 @@ def get_single_sequence(fname, genbank_only=False, stop_on_error=False): try: seq = propex.GenBank(fname) genbank_success = True - except ValueError: + except propex.genbank.ParsingError: pass if not genbank_success and not genbank_only: diff --git a/propex/genbank.py b/propex/genbank.py index 167ed8c..7a5fc17 100644 --- a/propex/genbank.py +++ b/propex/genbank.py @@ -10,6 +10,9 @@ from propex.sequence import Sequence +class LocationError(Exception): + pass + class Location(object): """Represent a GenBank feature location. @@ -93,7 +96,7 @@ def _parse(self): is located on the complement strand. Returned positions are 0-based. Raises: - ValueError: if the location string is not valid. + LocationError: if the location string is not valid. """ locstring = self.locstring re_name = None @@ -107,7 +110,7 @@ def _parse(self): re_name = name regex = r if re_name is None: - raise ValueError('unknown location string: {0}'.format(self.locstring)) + raise LocationError('unknown location string: {0}'.format(self.locstring)) if re_name == 'single': start = end = int(regex.match(locstring).group(1)) @@ -366,6 +369,9 @@ def _neighbor(self, feature, downstream=True): return self.features[ftype][findex] +class ParsingError(Exception): + pass + class GenBank(object): """Represent a GenBank file. @@ -375,7 +381,7 @@ class GenBank(object): * index: a list of dictionaries representing an index of the file. :param fname: filename of the GenBank file. - :raises: ValueError if parsing fails. 
+ :raises: :py:exc:`.ParsingError` if parsing fails. """ def __init__(self, fname): @@ -401,7 +407,7 @@ def _index(self): offset = 0 for lineno, line in enumerate(f): if lineno == 0 and not line.strip().startswith('LOCUS'): - raise ValueError('does not look like a GenBank file: {0}' \ + raise ParsingError('does not look like a GenBank file: {0}' \ .format(self.filename)) if line.strip().split()[0] == 'LOCUS': current_locus = line.strip().split()[1] diff --git a/propex/tests/test_genbank.py b/propex/tests/test_genbank.py index 30c58e5..d25a90d 100644 --- a/propex/tests/test_genbank.py +++ b/propex/tests/test_genbank.py @@ -62,7 +62,7 @@ def test_get_locus_from_name(self): assert len(loci) > 0 assert len(loci[0].seq) == 8967 - @raises(ValueError) + @raises(propex.genbank.ParsingError) def test_parse_fasta(self): gb = propex.GenBank(os.path.join(self.genbankdir, '..', 'data_fasta', 'LMG718-cremoris.fasta')) @@ -331,7 +331,7 @@ def test_overlap(self): assert not loc4.overlaps(loc3) assert not loc1.overlaps(loc5) - @raises(ValueError) + @raises(propex.genbank.LocationError) def test_invalid_location(self): loc = Location('123..noloc') From b3682453911fd00c24def000d30e2cac9e02c978 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= Date: Sun, 12 Apr 2015 20:07:48 +0200 Subject: [PATCH 04/40] Make location regexes private --- propex/genbank.py | 29 ++++++++++++++--------------- propex/tests/test_genbank.py | 20 ++++++++++---------- 2 files changed, 24 insertions(+), 25 deletions(-) diff --git a/propex/genbank.py b/propex/genbank.py index 7a5fc17..c70733a 100644 --- a/propex/genbank.py +++ b/propex/genbank.py @@ -21,7 +21,7 @@ class Location(object): http://www.insdc.org/files/feature_table.html#3.4 Location (and GenBank files) are using 1-based positions. 
To make - location-bases string handling easier, the Location class represents + location-based string handling easier, the Location class represents the locations internally as 0-based:: >>> loc = Location('42..84') @@ -50,18 +50,17 @@ class Location(object): """ #: Regular expression for finding complement locations. - loc_complement = re.compile(r'^complement\((.+)\)$') - + _re_complement = re.compile(r'^complement\((.+)\)$') #: Regular expression for single base locations. - loc_single = re.compile(r'^(\d+)$') + _re_single = re.compile(r'^(\d+)$') #: Regular expression for range locations, - loc_range = re.compile(r'^(\d+)\.\.(\d+)$') + _re_range = re.compile(r'^(\d+)\.\.(\d+)$') #: Regular expression for locations with unknown lower boundary. - loc_lower_unknown = re.compile(r'^<(\d+)\.\.(\d+)$') + _re_lower_unknown = re.compile(r'^<(\d+)\.\.(\d+)$') #: Regular expression for locations with unknown upper boundary. - loc_upper_unknown = re.compile(r'^(\d+)\.\.>(\d+)$') + _re_upper_unknown = re.compile(r'^(\d+)\.\.>(\d+)$') #: Regular expression for single base locations within a range. - loc_one_of = re.compile(r'^(\d+)\.(\d+)$') + _re_one_of = re.compile(r'^(\d+)\.(\d+)$') def __init__(self, locstring): """Location constructor. @@ -80,11 +79,11 @@ def _regex_dict(self): expression. 
""" return { - 'single': Location.loc_single, - 'range': Location.loc_range, - 'upper_unknown': Location.loc_lower_unknown, - 'lower_unknown': Location.loc_upper_unknown, - 'one_of': Location.loc_one_of + 'single': Location._re_single, + 'range': Location._re_range, + 'upper_unknown': Location._re_lower_unknown, + 'lower_unknown': Location._re_upper_unknown, + 'one_of': Location._re_one_of } def _parse(self): @@ -102,9 +101,9 @@ def _parse(self): re_name = None regex = None is_complement = False - if Location.loc_complement.match(locstring): + if Location._re_complement.match(locstring): is_complement = True - locstring = Location.loc_complement.match(locstring).group(1) + locstring = Location._re_complement.match(locstring).group(1) for name, r in self._regex_dict().iteritems(): if r.match(locstring) is not None: re_name = name diff --git a/propex/tests/test_genbank.py b/propex/tests/test_genbank.py index d25a90d..8df9d77 100644 --- a/propex/tests/test_genbank.py +++ b/propex/tests/test_genbank.py @@ -218,30 +218,30 @@ def setUp(self): self.complement2 = 'complement(467)' def test_complement(self): - match = Location.loc_complement.match(self.complement) + match = Location._re_complement.match(self.complement) assert match assert match.group(1) == '340..565' - match = Location.loc_complement.match(self.complement2) + match = Location._re_complement.match(self.complement2) assert match assert match.group(1) == '467' - match = Location.loc_complement.match(self.one_of) + match = Location._re_complement.match(self.one_of) assert match is None def test_range_regex(self): - match = Location.loc_range.match(self.range) - assert Location.loc_one_of.match(self.range) is None + match = Location._re_range.match(self.range) + assert Location._re_one_of.match(self.range) is None assert match assert match.group(1) == '340' assert match.group(2) == '565' def test_single_regex(self): - match = Location.loc_single.match(self.single) + match = 
Location._re_single.match(self.single) assert match assert match.group(1) == '467' def test_lower_unknown(self): - match1 = Location.loc_lower_unknown.match(self.lower_unknown) - match2 = Location.loc_lower_unknown.match(self.lower_unknown2) + match1 = Location._re_lower_unknown.match(self.lower_unknown) + match2 = Location._re_lower_unknown.match(self.lower_unknown2) assert match1 assert match1.group(1) == '345' assert match1.group(2) == '500' @@ -250,13 +250,13 @@ def test_lower_unknown(self): assert match2.group(2) == '888' def test_upper_unknown(self): - match = Location.loc_upper_unknown.match(self.upper_unknown) + match = Location._re_upper_unknown.match(self.upper_unknown) assert match assert match.group(1) == '1' assert match.group(2) == '888' def test_one_of(self): - match = Location.loc_one_of.match(self.one_of) + match = Location._re_one_of.match(self.one_of) assert match assert match.group(1) == '102' assert match.group(2) == '110' From 6f72b1628cf6ee944cc178e91a7730db16d81b62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= Date: Sun, 12 Apr 2015 20:20:12 +0200 Subject: [PATCH 05/40] Add missing case of unknown boundaries There was no way of identifying a location with both unknown lower and upper boundary. This could definitely be done more elegantly, but that is something for later. --- propex/genbank.py | 3 +++ propex/tests/test_genbank.py | 7 +++++++ 2 files changed, 10 insertions(+) diff --git a/propex/genbank.py b/propex/genbank.py index c70733a..19fbca0 100644 --- a/propex/genbank.py +++ b/propex/genbank.py @@ -59,6 +59,8 @@ class Location(object): _re_lower_unknown = re.compile(r'^<(\d+)\.\.(\d+)$') #: Regular expression for locations with unknown upper boundary. _re_upper_unknown = re.compile(r'^(\d+)\.\.>(\d+)$') + #: Regular expression for locations with unknown upper and lower boundary. + _re_lower_upper_unknown = re.compile(r'^<(\d+)\.\.>(\d+)$') #: Regular expression for single base locations within a range. 
_re_one_of = re.compile(r'^(\d+)\.(\d+)$') @@ -83,6 +85,7 @@ def _regex_dict(self): 'range': Location._re_range, 'upper_unknown': Location._re_lower_unknown, 'lower_unknown': Location._re_upper_unknown, + 'lower_upper_unkown': Location._re_lower_upper_unknown, 'one_of': Location._re_one_of } diff --git a/propex/tests/test_genbank.py b/propex/tests/test_genbank.py index 8df9d77..75ccbc3 100644 --- a/propex/tests/test_genbank.py +++ b/propex/tests/test_genbank.py @@ -213,6 +213,7 @@ def setUp(self): self.lower_unknown = '<345..500' self.lower_unknown2 = '<1..888' self.upper_unknown = '1..>888' + self.lower_upper_unkown = '<1..>888' self.one_of = '102.110' self.complement = 'complement(340..565)' self.complement2 = 'complement(467)' @@ -255,6 +256,12 @@ def test_upper_unknown(self): assert match.group(1) == '1' assert match.group(2) == '888' + def test_lower_upper_unknown(self): + match = Location._re_lower_upper_unknown.match(self.lower_upper_unkown) + assert match + assert match.group(1) == '1' + assert match.group(2) == '888' + def test_one_of(self): match = Location._re_one_of.match(self.one_of) assert match From e4764c29e3026d663cfe05c62970a2df81c74ae6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= Date: Sun, 12 Apr 2015 20:21:32 +0200 Subject: [PATCH 06/40] Add safeguard for empty lines --- propex/genbank.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/propex/genbank.py b/propex/genbank.py index 19fbca0..ad721f5 100644 --- a/propex/genbank.py +++ b/propex/genbank.py @@ -411,6 +411,8 @@ def _index(self): if lineno == 0 and not line.strip().startswith('LOCUS'): raise ParsingError('does not look like a GenBank file: {0}' \ .format(self.filename)) + if len(line.strip()) == 0: + continue if line.strip().split()[0] == 'LOCUS': current_locus = line.strip().split()[1] indexdicts.append({}) From 1026a107dcb6509d79b23b836462352a005131b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= Date: Sun, 12 Apr 2015 20:23:09 +0200 Subject: 
[PATCH 07/40] Split genbank tests general and local The local class requires some files that I cannot put in the repo. Instead I split the class into two: one for these local files and one for the files I add to the repo. Currently there's only one file in the repo. --- propex/tests/data/U49845.gb | 167 +++++++++++++++++++++++++++++++++++ propex/tests/test_genbank.py | 10 +++ 2 files changed, 177 insertions(+) create mode 100644 propex/tests/data/U49845.gb diff --git a/propex/tests/data/U49845.gb b/propex/tests/data/U49845.gb new file mode 100644 index 0000000..c56f129 --- /dev/null +++ b/propex/tests/data/U49845.gb @@ -0,0 +1,167 @@ +LOCUS SCU49845 5028 bp DNA linear PLN 23-MAR-2010 +DEFINITION Saccharomyces cerevisiae TCP1-beta gene, partial cds; and Axl2p + (AXL2) and Rev7p (REV7) genes, complete cds. +ACCESSION U49845 +VERSION U49845.1 GI:1293613 +KEYWORDS . +SOURCE Saccharomyces cerevisiae (baker's yeast) + ORGANISM Saccharomyces cerevisiae + Eukaryota; Fungi; Dikarya; Ascomycota; Saccharomycotina; + Saccharomycetes; Saccharomycetales; Saccharomycetaceae; + Saccharomyces. +REFERENCE 1 (bases 1 to 5028) + AUTHORS Roemer,T., Madden,K., Chang,J. and Snyder,M. + TITLE Selection of axial growth sites in yeast requires Axl2p, a novel + plasma membrane glycoprotein + JOURNAL Genes Dev. 10 (7), 777-793 (1996) + PUBMED 8846915 +REFERENCE 2 (bases 1 to 5028) + AUTHORS Roemer,T. 
+ TITLE Direct Submission + JOURNAL Submitted (22-FEB-1996) Biology, Yale University, New Haven, CT + 06520, USA +FEATURES Location/Qualifiers + source 1..5028 + /organism="Saccharomyces cerevisiae" + /mol_type="genomic DNA" + /db_xref="taxon:4932" + /chromosome="IX" + mRNA <1..>206 + /product="TCP1-beta" + CDS <1..206 + /codon_start=3 + /product="TCP1-beta" + /protein_id="AAA98665.1" + /db_xref="GI:1293614" + /translation="SSIYNGISTSGLDLNNGTIADMRQLGIVESYKLKRAVVSSASEA + AEVLLRVDNIIRARPRTANRQHM" + gene <687..>3158 + /gene="AXL2" + mRNA <687..>3158 + /gene="AXL2" + /product="Axl2p" + CDS 687..3158 + /gene="AXL2" + /note="plasma membrane glycoprotein" + /codon_start=1 + /product="Axl2p" + /protein_id="AAA98666.1" + /db_xref="GI:1293615" + /translation="MTQLQISLLLTATISLLHLVVATPYEAYPIGKQYPPVARVNESF + TFQISNDTYKSSVDKTAQITYNCFDLPSWLSFDSSSRTFSGEPSSDLLSDANTTLYFN + VILEGTDSADSTSLNNTYQFVVTNRPSISLSSDFNLLALLKNYGYTNGKNALKLDPNE + VFNVTFDRSMFTNEESIVSYYGRSQLYNAPLPNWLFFDSGELKFTGTAPVINSAIAPE + TSYSFVIIATDIEGFSAVEVEFELVIGAHQLTTSIQNSLIINVTDTGNVSYDLPLNYV + YLDDDPISSDKLGSINLLDAPDWVALDNATISGSVPDELLGKNSNPANFSVSIYDTYG + DVIYFNFEVVSTTDLFAISSLPNINATRGEWFSYYFLPSQFTDYVNTNVSLEFTNSSQ + DHDWVKFQSSNLTLAGEVPKNFDKLSLGLKANQGSQSQELYFNIIGMDSKITHSNHSA + NATSTRSSHHSTSTSSYTSSTYTAKISSTSAAATSSAPAALPAANKTSSHNKKAVAIA + CGVAIPLGVILVALICFLIFWRRRRENPDDENLPHAISGPDLNNPANKPNQENATPLN + NPFDDDASSYDDTSIARRLAALNTLKLDNHSATESDISSVDEKRDSLSGMNTYNDQFQ + SQSKEELLAKPPVQPPESPFFDPQNRSSSVYMDSEPAVNKSWRYTGNLSPVSDIVRDS + YGSQKTVDTEKLFDLEAPEKEKRTSRDVTMSSLDPWNSNISPSPVRKSVTPSPYNVTK + HRNRHLQNIQDSQSGKNGITPTTMSTSSSDDFVPVKDGENFCWVHSMEPDRRPSKKRL + VDFSNKSNVNVGQVKDIHGRIPEML" + gene complement(<3300..>4037) + /gene="REV7" + mRNA complement(<3300..>4037) + /gene="REV7" + /product="Rev7p" + CDS complement(3300..4037) + /gene="REV7" + /codon_start=1 + /product="Rev7p" + /protein_id="AAA98667.1" + /db_xref="GI:1293616" + /translation="MNRWVEKWLRVYLKCYINLILFYRNVYPPQSFDYTTYQSFNLPQ + FVPINRHPALIDYIEELILDVLSKLTHVYRFSICIINKKNDLCIEKYVLDFSELQHVD + 
KDDQIITETEVFDEFRSSLNSLIMHLEKLPKVNDDTITFEAVINAIELELGHKLDRNR + RVDSLEEKAEIERDSNWVKCQEDENLPDNNGFQPPKIKLTSLVGSDVGPLIIHQFSEK + LISGDDKILNGVYSQYEEGESIFGSLF" +ORIGIN + 1 gatcctccat atacaacggt atctccacct caggtttaga tctcaacaac ggaaccattg + 61 ccgacatgag acagttaggt atcgtcgaga gttacaagct aaaacgagca gtagtcagct + 121 ctgcatctga agccgctgaa gttctactaa gggtggataa catcatccgt gcaagaccaa + 181 gaaccgccaa tagacaacat atgtaacata tttaggatat acctcgaaaa taataaaccg + 241 ccacactgtc attattataa ttagaaacag aacgcaaaaa ttatccacta tataattcaa + 301 agacgcgaaa aaaaaagaac aacgcgtcat agaacttttg gcaattcgcg tcacaaataa + 361 attttggcaa cttatgtttc ctcttcgagc agtactcgag ccctgtctca agaatgtaat + 421 aatacccatc gtaggtatgg ttaaagatag catctccaca acctcaaagc tccttgccga + 481 gagtcgccct cctttgtcga gtaattttca cttttcatat gagaacttat tttcttattc + 541 tttactctca catcctgtag tgattgacac tgcaacagcc accatcacta gaagaacaga + 601 acaattactt aatagaaaaa ttatatcttc ctcgaaacga tttcctgctt ccaacatcta + 661 cgtatatcaa gaagcattca cttaccatga cacagcttca gatttcatta ttgctgacag + 721 ctactatatc actactccat ctagtagtgg ccacgcccta tgaggcatat cctatcggaa + 781 aacaataccc cccagtggca agagtcaatg aatcgtttac atttcaaatt tccaatgata + 841 cctataaatc gtctgtagac aagacagctc aaataacata caattgcttc gacttaccga + 901 gctggctttc gtttgactct agttctagaa cgttctcagg tgaaccttct tctgacttac + 961 tatctgatgc gaacaccacg ttgtatttca atgtaatact cgagggtacg gactctgccg + 1021 acagcacgtc tttgaacaat acataccaat ttgttgttac aaaccgtcca tccatctcgc + 1081 tatcgtcaga tttcaatcta ttggcgttgt taaaaaacta tggttatact aacggcaaaa + 1141 acgctctgaa actagatcct aatgaagtct tcaacgtgac ttttgaccgt tcaatgttca + 1201 ctaacgaaga atccattgtg tcgtattacg gacgttctca gttgtataat gcgccgttac + 1261 ccaattggct gttcttcgat tctggcgagt tgaagtttac tgggacggca ccggtgataa + 1321 actcggcgat tgctccagaa acaagctaca gttttgtcat catcgctaca gacattgaag + 1381 gattttctgc cgttgaggta gaattcgaat tagtcatcgg ggctcaccag ttaactacct + 1441 ctattcaaaa tagtttgata atcaacgtta ctgacacagg taacgtttca tatgacttac + 1501 ctctaaacta tgtttatctc 
gatgacgatc ctatttcttc tgataaattg ggttctataa + 1561 acttattgga tgctccagac tgggtggcat tagataatgc taccatttcc gggtctgtcc + 1621 cagatgaatt actcggtaag aactccaatc ctgccaattt ttctgtgtcc atttatgata + 1681 cttatggtga tgtgatttat ttcaacttcg aagttgtctc cacaacggat ttgtttgcca + 1741 ttagttctct tcccaatatt aacgctacaa ggggtgaatg gttctcctac tattttttgc + 1801 cttctcagtt tacagactac gtgaatacaa acgtttcatt agagtttact aattcaagcc + 1861 aagaccatga ctgggtgaaa ttccaatcat ctaatttaac attagctgga gaagtgccca + 1921 agaatttcga caagctttca ttaggtttga aagcgaacca aggttcacaa tctcaagagc + 1981 tatattttaa catcattggc atggattcaa agataactca ctcaaaccac agtgcgaatg + 2041 caacgtccac aagaagttct caccactcca cctcaacaag ttcttacaca tcttctactt + 2101 acactgcaaa aatttcttct acctccgctg ctgctacttc ttctgctcca gcagcgctgc + 2161 cagcagccaa taaaacttca tctcacaata aaaaagcagt agcaattgcg tgcggtgttg + 2221 ctatcccatt aggcgttatc ctagtagctc tcatttgctt cctaatattc tggagacgca + 2281 gaagggaaaa tccagacgat gaaaacttac cgcatgctat tagtggacct gatttgaata + 2341 atcctgcaaa taaaccaaat caagaaaacg ctacaccttt gaacaacccc tttgatgatg + 2401 atgcttcctc gtacgatgat acttcaatag caagaagatt ggctgctttg aacactttga + 2461 aattggataa ccactctgcc actgaatctg atatttccag cgtggatgaa aagagagatt + 2521 ctctatcagg tatgaataca tacaatgatc agttccaatc ccaaagtaaa gaagaattat + 2581 tagcaaaacc cccagtacag cctccagaga gcccgttctt tgacccacag aataggtctt + 2641 cttctgtgta tatggatagt gaaccagcag taaataaatc ctggcgatat actggcaacc + 2701 tgtcaccagt ctctgatatt gtcagagaca gttacggatc acaaaaaact gttgatacag + 2761 aaaaactttt cgatttagaa gcaccagaga aggaaaaacg tacgtcaagg gatgtcacta + 2821 tgtcttcact ggacccttgg aacagcaata ttagcccttc tcccgtaaga aaatcagtaa + 2881 caccatcacc atataacgta acgaagcatc gtaaccgcca cttacaaaat attcaagact + 2941 ctcaaagcgg taaaaacgga atcactccca caacaatgtc aacttcatct tctgacgatt + 3001 ttgttccggt taaagatggt gaaaattttt gctgggtcca tagcatggaa ccagacagaa + 3061 gaccaagtaa gaaaaggtta gtagattttt caaataagag taatgtcaat gttggtcaag + 3121 ttaaggacat tcacggacgc atcccagaaa tgctgtgatt 
atacgcaacg atattttgct + 3181 taattttatt ttcctgtttt attttttatt agtggtttac agatacccta tattttattt + 3241 agtttttata cttagagaca tttaatttta attccattct tcaaatttca tttttgcact + 3301 taaaacaaag atccaaaaat gctctcgccc tcttcatatt gagaatacac tccattcaaa + 3361 attttgtcgt caccgctgat taatttttca ctaaactgat gaataatcaa aggccccacg + 3421 tcagaaccga ctaaagaagt gagttttatt ttaggaggtt gaaaaccatt attgtctggt + 3481 aaattttcat cttcttgaca tttaacccag tttgaatccc tttcaatttc tgctttttcc + 3541 tccaaactat cgaccctcct gtttctgtcc aacttatgtc ctagttccaa ttcgatcgca + 3601 ttaataactg cttcaaatgt tattgtgtca tcgttgactt taggtaattt ctccaaatgc + 3661 ataatcaaac tatttaagga agatcggaat tcgtcgaaca cttcagtttc cgtaatgatc + 3721 tgatcgtctt tatccacatg ttgtaattca ctaaaatcta aaacgtattt ttcaatgcat + 3781 aaatcgttct ttttattaat aatgcagatg gaaaatctgt aaacgtgcgt taatttagaa + 3841 agaacatcca gtataagttc ttctatatag tcaattaaag caggatgcct attaatggga + 3901 acgaactgcg gcaagttgaa tgactggtaa gtagtgtagt cgaatgactg aggtgggtat + 3961 acatttctat aaaataaaat caaattaatg tagcatttta agtataccct cagccacttc + 4021 tctacccatc tattcataaa gctgacgcaa cgattactat tttttttttc ttcttggatc + 4081 tcagtcgtcg caaaaacgta taccttcttt ttccgacctt ttttttagct ttctggaaaa + 4141 gtttatatta gttaaacagg gtctagtctt agtgtgaaag ctagtggttt cgattgactg + 4201 atattaagaa agtggaaatt aaattagtag tgtagacgta tatgcatatg tatttctcgc + 4261 ctgtttatgt ttctacgtac ttttgattta tagcaagggg aaaagaaata catactattt + 4321 tttggtaaag gtgaaagcat aatgtaaaag ctagaataaa atggacgaaa taaagagagg + 4381 cttagttcat cttttttcca aaaagcaccc aatgataata actaaaatga aaaggatttg + 4441 ccatctgtca gcaacatcag ttgtgtgagc aataataaaa tcatcacctc cgttgccttt + 4501 agcgcgtttg tcgtttgtat cttccgtaat tttagtctta tcaatgggaa tcataaattt + 4561 tccaatgaat tagcaatttc gtccaattct ttttgagctt cttcatattt gctttggaat + 4621 tcttcgcact tcttttccca ttcatctctt tcttcttcca aagcaacgat ccttctaccc + 4681 atttgctcag agttcaaatc ggcctctttc agtttatcca ttgcttcctt cagtttggct + 4741 tcactgtctt ctagctgttg ttctagatcc tggtttttct tggtgtagtt ctcattatta + 4801 
gatctcaagt tattggagtc ttcagccaat tgctttgtat cagacaattg actctctaac + 4861 ttctccactt cactgtcgag ttgctcgttt ttagcggaca aagatttaat ctcgttttct + 4921 ttttcagtgt tagattgctc taattctttg agctgttctc tcagctcctc atatttttct + 4981 tgccatgact cagattctaa ttttaagcta ttcaatttct ctttgatc +// + diff --git a/propex/tests/test_genbank.py b/propex/tests/test_genbank.py index 75ccbc3..140452f 100644 --- a/propex/tests/test_genbank.py +++ b/propex/tests/test_genbank.py @@ -7,6 +7,16 @@ class TestGenBank: + def setUp(self): + self.testdir = os.path.dirname(__file__) + self.sc = os.path.join(self.testdir, 'data', 'U49845.gb') + + def test_sequence_length(self): + gb = propex.GenBank(self.sc) + assert len(gb[0].seq) == 5028 + +class TestGenBankLocal: + def setUp(self): self.testdir = os.path.dirname(__file__) self.genbankdir = os.path.join(os.path.expanduser('~'), 'Dropbox', From a170badac60d10b9561dba2c5bd32381f38fcec2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= Date: Sun, 12 Apr 2015 21:33:03 +0200 Subject: [PATCH 08/40] Add all features to the genbank locus --- propex/genbank.py | 24 +++++++++++++++--------- propex/tests/test_genbank.py | 16 ++++++++++++++-- 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/propex/genbank.py b/propex/genbank.py index ad721f5..6de0457 100644 --- a/propex/genbank.py +++ b/propex/genbank.py @@ -289,6 +289,11 @@ def features_at_location(self, location): """ features = [] for feat in self.features.iterkeys(): + # Skip the source feature since it always will + # overlap (assuming that the feature location + # is within the sequence boundaries). 
+ if feat == 'source': + continue for feature in self.features[feat]: if feature.location.overlaps(location): features.append(feature) @@ -452,20 +457,21 @@ def __getitem__(self, index): locus_index = self.index[index] locus_offset = locus_index['offset'] origin_offset = locus_index['ORIGIN'] - features = {'CDS': []} + features = collections.defaultdict(list) with open(self.filename) as f: - # Get the CDSs - if 'CDS' in locus_index: - for cds in locus_index['CDS']: - f.seek(cds['offset']) - cds_string = f.readline() + for ftype in self.features: + if ftype not in locus_index: + continue + for feature in locus_index[ftype]: + f.seek(feature['offset']) + feature_string = f.readline() line = f.readline() while line[5] == ' ': - cds_string += line + feature_string += line line = f.readline() - features['CDS'].append( + features[ftype].append( GenBankFeature.from_string(locus_index['name'], - cds_string)) + feature_string)) # Get the sequence f.seek(origin_offset) diff --git a/propex/tests/test_genbank.py b/propex/tests/test_genbank.py index 140452f..06b46c8 100644 --- a/propex/tests/test_genbank.py +++ b/propex/tests/test_genbank.py @@ -10,10 +10,22 @@ class TestGenBank: def setUp(self): self.testdir = os.path.dirname(__file__) self.sc = os.path.join(self.testdir, 'data', 'U49845.gb') + self.gb = propex.GenBank(self.sc) def test_sequence_length(self): - gb = propex.GenBank(self.sc) - assert len(gb[0].seq) == 5028 + assert len(self.gb[0].seq) == 5028 + + def test_mRNA(self): + assert len(self.gb[0].features['mRNA']) == 3 + + def test_next_downstream(self): + locus = self.gb[0] + gbf = locus.features['mRNA'][0] + assert gbf is not None + assert str(gbf.location) == '<1..>206' + next = locus.next_downstream(gbf) + assert next is not None + assert str(next.location) == '<687..>3158' class TestGenBankLocal: From 608d58b2170482200e32c858c4b257ddcd7b93b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= Date: Sun, 12 Apr 2015 20:21:32 +0200 Subject: [PATCH 
09/40] Add safeguard for empty lines --- propex/genbank.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/propex/genbank.py b/propex/genbank.py index a4f0ce8..bf03c40 100644 --- a/propex/genbank.py +++ b/propex/genbank.py @@ -406,6 +406,8 @@ def _index(self): if lineno == 0 and not line.strip().startswith('LOCUS'): raise ValueError('does not look like a GenBank file: {0}' \ .format(self.filename)) + if len(line.strip()) == 0: + continue if line.strip().split()[0] == 'LOCUS': current_locus = line.strip().split()[1] indexdicts.append({}) From 8ebc84b04fe3e7114655239f31e4f1655be93907 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= Date: Tue, 14 Apr 2015 20:58:07 +0200 Subject: [PATCH 10/40] Fix empty line bug --- propex/genbank.py | 1 + propex/tests/data/U49845.gb | 1 + 2 files changed, 2 insertions(+) diff --git a/propex/genbank.py b/propex/genbank.py index 6de0457..55d9b77 100644 --- a/propex/genbank.py +++ b/propex/genbank.py @@ -417,6 +417,7 @@ def _index(self): raise ParsingError('does not look like a GenBank file: {0}' \ .format(self.filename)) if len(line.strip()) == 0: + offset += len(line) continue if line.strip().split()[0] == 'LOCUS': current_locus = line.strip().split()[1] diff --git a/propex/tests/data/U49845.gb b/propex/tests/data/U49845.gb index c56f129..c053364 100644 --- a/propex/tests/data/U49845.gb +++ b/propex/tests/data/U49845.gb @@ -62,6 +62,7 @@ FEATURES Location/Qualifiers YGSQKTVDTEKLFDLEAPEKEKRTSRDVTMSSLDPWNSNISPSPVRKSVTPSPYNVTK HRNRHLQNIQDSQSGKNGITPTTMSTSSSDDFVPVKDGENFCWVHSMEPDRRPSKKRL VDFSNKSNVNVGQVKDIHGRIPEML" + gene complement(<3300..>4037) /gene="REV7" mRNA complement(<3300..>4037) From 64c1c2cb81e8ee4d61ab87838a7dac2f4095a0ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= Date: Tue, 14 Apr 2015 20:58:07 +0200 Subject: [PATCH 11/40] Fix empty line bug Conflicts: propex/tests/data/U49845.gb --- propex/genbank.py | 1 + 1 file changed, 1 insertion(+) diff --git a/propex/genbank.py b/propex/genbank.py 
index bf03c40..cc24b64 100644 --- a/propex/genbank.py +++ b/propex/genbank.py @@ -407,6 +407,7 @@ def _index(self): raise ValueError('does not look like a GenBank file: {0}' \ .format(self.filename)) if len(line.strip()) == 0: + offset += len(line) continue if line.strip().split()[0] == 'LOCUS': current_locus = line.strip().split()[1] From f2846be56478f2bad9cbcedf3a6913a75c8cb333 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= Date: Thu, 16 Apr 2015 20:39:23 +0200 Subject: [PATCH 12/40] Allow directories in input directory --- propex/tests/test_genbank.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/propex/tests/test_genbank.py b/propex/tests/test_genbank.py index 06b46c8..257b8f3 100644 --- a/propex/tests/test_genbank.py +++ b/propex/tests/test_genbank.py @@ -59,7 +59,8 @@ def test_iteration(self): def test_load_directory(self): gbs = [propex.GenBank(os.path.join(self.genbankdir, x)) \ - for x in os.listdir(self.genbankdir)] + for x in os.listdir(self.genbankdir) \ + if os.path.isfile(os.path.join(self.genbankdir, x))] def test_features_at_location(self): gb = propex.GenBank(self.lmg718) From afbe591ddb8dcd80cdc0683ca10ec734974c7bcb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= Date: Fri, 17 Apr 2015 15:41:31 +0200 Subject: [PATCH 13/40] Parse GenBank header The results are not yet used, but should go into the GenBankLocus class. --- propex/genbank.py | 85 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/propex/genbank.py b/propex/genbank.py index 55d9b77..68f8a7f 100644 --- a/propex/genbank.py +++ b/propex/genbank.py @@ -449,6 +449,78 @@ def _index(self): return indexdicts, features + def _parse_header(self, hstring): + """Parse a GenBank header string into a nested dictionary. 
+ """ + head_data = collections.OrderedDict() + + header_lines = iter(hstring.splitlines(True)) + + line = header_lines.next() + + header = line.strip().split() + + name = header[1] + length = ' '.join(header[2:4]) + molecule = header[4] + molecule_type = header[5] + + if len(header) == 8: + division = header[6] + date = header[7] + elif len(header) == 7: + division = '' + date = header[6] + + head_data['LOCUS'] = { + 'name': name, + 'length': length, + 'molecule': molecule, + 'molecule_type': molecule_type, + 'genbank_division': division, + 'modification_data': date + } + + last_key = None + line = header_lines.next() + while True: + if line[0] != ' ': + key = line[:11].strip() + last_key = key + if key in head_data: + old_entry = head_data[key] + if not isinstance(old_entry, list): + head_data[key] = [(old_entry, + collections.OrderedDict())] + head_data[key].append((line[11:].strip(), + collections.OrderedDict())) + else: + head_data[key] = line[11:].strip() + elif len(line[:11].strip()) != 0: + sub_key = line[:11].strip() + old_entry = head_data[last_key] + if not isinstance(old_entry, list): + head_data[last_key] = [(old_entry, + collections.OrderedDict())] + old_entry = head_data[key][-1] + old_entry[1][sub_key] = line[11:].strip() + head_data[key][-1] = (old_entry[0], old_entry[1]) + else: + if isinstance(head_data[last_key], list): + sub_key = head_data[last_key][-1][1].keys()[-1] + head_data[last_key][-1][1][sub_key] = \ + '\n'.join([head_data[last_key][-1][1][sub_key], + line.strip()]) + else: + head_data[last_key] = '\n'.join([head_data[last_key], + line.strip()]) + try: + line = header_lines.next() + except StopIteration: + break + + return head_data + def __getitem__(self, index): """Get a specific GenBankLocus object. 
@@ -459,7 +531,20 @@ def __getitem__(self, index): locus_offset = locus_index['offset'] origin_offset = locus_index['ORIGIN'] features = collections.defaultdict(list) + + headstring = '' + with open(self.filename) as f: + f.seek(locus_offset) + headstring += f.readline() + + line = f.readline() + while not line.startswith('FEATURES'): + headstring += line + line = f.readline() + + head_data = self._parse_header(headstring) + for ftype in self.features: if ftype not in locus_index: continue From 731426d39f0b47a502929ad75d2fb19f9848f5e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= Date: Fri, 17 Apr 2015 15:42:48 +0200 Subject: [PATCH 14/40] Shorten long lines --- propex/genbank.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/propex/genbank.py b/propex/genbank.py index 68f8a7f..70f08e5 100644 --- a/propex/genbank.py +++ b/propex/genbank.py @@ -112,7 +112,8 @@ def _parse(self): re_name = name regex = r if re_name is None: - raise LocationError('unknown location string: {0}'.format(self.locstring)) + raise LocationError('unknown location string: {0}' \ + .format(self.locstring)) if re_name == 'single': start = end = int(regex.match(locstring).group(1)) @@ -134,7 +135,8 @@ def min_distance(self, other): if self.overlaps(other): return 0 else: - return min(abs(self.start - other.end), abs(self.end - other.start)) + return min(abs(self.start - other.end), + abs(self.end - other.start)) @classmethod def from_int(cls, start, end=None, strand='+'): @@ -165,7 +167,8 @@ class GenBankFeature(object): **Class attributes:** * **feature_type**: a string with the feature key. - * **location**: a Location object representing the location of the feature. + * **location**: a Location object representing the location of + the feature. * **qualifiers**: a dictionary of qualifiers of the feature. :param locus: the name of the locus that the feature belongs to. 
@@ -181,7 +184,8 @@ def __init__(self, locus, feature_type, location, qualifiers=None): Args: locus: the locus that the feature belongs to. feature_type: the key of the feature, e.g. 'CDS' or 'tRNA'. - location: a Location object representing the location of the feature + location: a Location object representing the location of the + feature qualifiers: a dictionary of qualifiers with the qualifier names as keys and the qualifier values as values. """ @@ -362,7 +366,8 @@ def _neighbor(self, feature, downstream=True): break - if findex is None or findex >= len(self.features[ftype]) or findex < 0: + if findex is None or findex >= len(self.features[ftype]) or \ + findex < 0: return None # Make sure the feature is on the same strand @@ -414,8 +419,9 @@ def _index(self): offset = 0 for lineno, line in enumerate(f): if lineno == 0 and not line.strip().startswith('LOCUS'): - raise ParsingError('does not look like a GenBank file: {0}' \ - .format(self.filename)) + raise ParsingError( + 'does not look like a GenBank file: {0}' \ + .format(self.filename)) if len(line.strip()) == 0: offset += len(line) continue From 39527343bfa08214a2ae81e513d1b896615f6ee2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= Date: Fri, 17 Apr 2015 15:56:49 +0200 Subject: [PATCH 15/40] Add header to GenBankLocus + test --- propex/genbank.py | 9 +++++++-- propex/tests/test_genbank.py | 15 +++++++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/propex/genbank.py b/propex/genbank.py index 70f08e5..1117d7f 100644 --- a/propex/genbank.py +++ b/propex/genbank.py @@ -267,7 +267,7 @@ class GenBankLocus(object): :param features: a dictionary containing features of the locus. """ - def __init__(self, name, seq, features=None): + def __init__(self, name, seq, features=None, header=None): """GenBankLocus constructor. 
Args: @@ -281,6 +281,10 @@ def __init__(self, name, seq, features=None): self.features = {} else: self.features = features + if header is None: + self.header = {} + else: + self.header = header def features_at_location(self, location): """Get features at a location. @@ -573,7 +577,8 @@ def __getitem__(self, index): line = f.readline() seq += ''.join(line.strip().split()[1:]) - return GenBankLocus(locus_index['name'], Sequence(seq), features) + return GenBankLocus(locus_index['name'], Sequence(seq), features, + head_data) def get_locus_from_name(self, name): """Get a specific GenBankLocus object from the locus name. diff --git a/propex/tests/test_genbank.py b/propex/tests/test_genbank.py index 257b8f3..ed8896c 100644 --- a/propex/tests/test_genbank.py +++ b/propex/tests/test_genbank.py @@ -27,6 +27,21 @@ def test_next_downstream(self): assert next is not None assert str(next.location) == '<687..>3158' + def test_header(self): + header = self.gb[0].header + + assert all(x in header for x in ['LOCUS', 'DEFINITION', + 'ACCESSION', 'VERSION', 'KEYWORDS', 'SOURCE', 'REFERENCE']) + + assert header['LOCUS']['molecule'] == 'DNA' + + assert header['ACCESSION'] == 'U49845' + + assert len(header['REFERENCE']) == 2 + assert header['REFERENCE'][0][0] == '1 (bases 1 to 5028)' + assert all(x in header['REFERENCE'][0][1] for x in ['AUTHORS', + 'TITLE', 'JOURNAL', 'PUBMED']) + class TestGenBankLocal: def setUp(self): From 2d7eef0639d721495a0d0d3045a330e83c29de63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20M=C3=A4hler?= Date: Sat, 18 Apr 2015 22:46:24 +0200 Subject: [PATCH 16/40] Handle join locations The genbank module is now able to handle join locations, including join locations that are wrapped with complement. The comparison operator is currently a bit misleading since the locations join(complement(...),complement(...)) and complement(join(...,...)) will not be equal. Will probably be fixed in the future. 
--- propex/genbank.py | 141 +++++++++++++++++++++++++++++++++-- propex/tests/test_genbank.py | 86 ++++++++++++++++++++- 2 files changed, 219 insertions(+), 8 deletions(-) diff --git a/propex/genbank.py b/propex/genbank.py index 1117d7f..a34f78f 100644 --- a/propex/genbank.py +++ b/propex/genbank.py @@ -6,6 +6,7 @@ """ import collections +import itertools import re from propex.sequence import Sequence @@ -13,6 +14,120 @@ class LocationError(Exception): pass +def parse_location(locstring): + """Parse a location string and return a :py:class:`.Location` + or :py:class:`.JoinLocation` object. + + :param locstring: a GenBank location string. + :raises: :py:class:`.LocationError` if parsing fails. + """ + if locstring.startswith('join') or \ + locstring.startswith('complement(join'): + return JoinLocation(locstring) + else: + return Location(locstring) + +class JoinLocation(object): + + """Represent a "join" GenBank feature location. + + For more information on locations, see + http://www.insdc.org/files/feature_table.html#3.4 + + For information on how locations work, see :py:class:`.Location`. + + :param locstring: a GenBank location string. + :raises: :py:class:`.LocationError` if parsing fails. 
+ """ + + def __init__(self, locstring): + self.locstring = locstring + self.loctype = 'join' + self.locations, self.start, self.end, self.is_complement = \ + self._get_locations() + + def _get_locations(self): + complement_wrap = False + if self.locstring.startswith('complement'): + compmatch = re.match(r'complement\((join\(.+\))\)', self.locstring) + if compmatch is None: + raise LocationError('invalid join location: {0}' \ + .format(self.locstring)) + locstring = compmatch.group(1) + complement_wrap = True + else: + locstring = self.locstring + + match = re.match(r'^join\((.+)\)$', locstring) + if match is None: + raise LocationError('invalid join location: {0}' \ + .format(self.locstring)) + locstring = match.group(1) + locations = [Location(x.strip()) for x in locstring.split(',')] + if len(set(x.is_complement for x in locations)) != 1: + raise LocationError('joint location is located on both strands') + start = min(x.start for x in locations) + end = max(x.end for x in locations) + if not complement_wrap: + is_complement = locations[0].is_complement + else: + is_complement = True + return locations, start, end, is_complement + + def overlaps(self, other): + """Test whether the location overlaps with another location. + + :param other: a :py:class:`.Location` or :py:class:`.JoinLocation` + object. + :returns: True if the locations overlap with at least one base, + otherwise False. + """ + if isinstance(other, JoinLocation): + for loc1, loc2 in itertools.product(self.locations, + other.locations): + if loc1.overlaps(loc2): + return True + else: + for loc in self.locations: + if loc.overlaps(other): + return True + return False + + def min_distance(self, other): + """Get the minimum distance to another location. + + :param other: a :py:class:`.Location` or :py:class:`.JoinLocation` + object. + :returns: the minimum distance between the locations. 
+ """ + min_acc = [] + if isinstance(other, JoinLocation): + for loc1, loc2 in itertools.product(self.locations, + other.locations): + d = loc1.min_distance(loc2) + if d == 0: + return 0 + min_acc.append(d) + else: + for loc in self.locations: + d = loc.min_distance(other) + if d == 0: + return 0 + min_acc.append(d) + return min(min_acc) + + def __str__(self): + return self.locstring + + def __eq__(self, other): + return str(self) == str(other) + + def __ne__(self, other): + return not self == other + + def __repr__(self): + return ''.format(repr(self.locstring)) + class Location(object): """Represent a GenBank feature location. @@ -47,6 +162,7 @@ class Location(object): the complement of the sequence. :param locstring: a GenBank location string. + :raises: :py:class:`.LocationError` if parsing fails. """ #: Regular expression for finding complement locations. @@ -77,7 +193,7 @@ def _regex_dict(self): Returns: a dictionary where the keys correspond to the type of - location and the values are the correpsponding regular + location and the values are the corresponding regular expression. """ return { @@ -122,16 +238,27 @@ def _parse(self): return re_name, start - 1, end - 1, is_complement - def overlaps(self, location): + def overlaps(self, other): """Test whether the location overlaps with another location. - :param location: a Location object. + :param other: a :py:class:`.Location` or :py:class:`.JoinLocation` + object. :returns: True if the locations overlap with at least one base, otherwise False. """ - return self.start <= location.end and location.start <= self.end + if isinstance(other, JoinLocation): + return other.overlaps(self) + return self.start <= other.end and other.start <= self.end def min_distance(self, other): + """Get the minimum distance to another location. + + :param other: a :py:class:`.Location` or :py:class:`.JoinLocation` + object. + :returns: the minimum distance between the locations. 
+ """ + if isinstance(other, JoinLocation): + return other.min_distance(self) if self.overlaps(other): return 0 else: @@ -238,14 +365,14 @@ def from_string(cls, locus, feature_string): value = qualifiers[-1][1] + line.strip('"') qualifiers[-1] = (key, value) - return cls(locus, ftype, Location(location), dict(qualifiers)) + return cls(locus, ftype, parse_location(location), dict(qualifiers)) def get_qualifier(self, qualifier_name): """Get a feature qualifier. :param qualifier_name: a string representing a qualifier. :returns: the value of the qualifier. - :raises: KeyError if the feature does not have a qualifier called + :raises: :py:class:`KeyError` if the feature does not have a qualifier called ``qualifier_name``. """ if qualifier_name not in self.qualifiers: @@ -444,7 +571,7 @@ def _index(self): indexdicts[-1][feature] = [] indexdicts[-1][feature].append({ 'offset': offset, - 'location': Location(line.strip().split()[1]) + 'location': parse_location(line.strip().split()[1]) }) if line.startswith('FEATURES'): in_features = True diff --git a/propex/tests/test_genbank.py b/propex/tests/test_genbank.py index ed8896c..824c46e 100644 --- a/propex/tests/test_genbank.py +++ b/propex/tests/test_genbank.py @@ -3,7 +3,7 @@ import os import propex -from propex.genbank import Location +from propex.genbank import Location, JoinLocation class TestGenBank: @@ -203,6 +203,20 @@ def test_parse_feature(self): assert gbf_inference == ['ab initio prediction:Prodigal:2.60', 'similar to AA sequence:UniProtKB:Q9RVE0'] + def test_feature_join_location(self): + feature = ''' CDS join(52625..53704,54000..55000) + /gene="recF" + /locus_tag="LMG718_02589" + /inference="ab initio prediction:Prodigal:2.60" + /inference="similar to AA sequence:UniProtKB:Q9RVE0" + /codon_start=1 + /transl_table=11''' + gbf = propex.GenBankFeature.from_string('testlocus', feature) + + assert gbf.feature_type == 'CDS' + assert gbf.get_qualifier('gene') == 'recF' + assert len(gbf.get_qualifier('inference')) == 2 
+ def test_empty_qualifiers(self): feature = ''' CDS complement(52625..53704) /gene="recF" @@ -408,3 +422,73 @@ def test_from_int(self): assert str(Location.from_int(100, 200)) == '100..200' assert str(Location.from_int(100, 200, '-')) == 'complement(100..200)' assert str(Location.from_int(100, strand='-')) == 'complement(100)' + +class TestJoinLocation: + + def setUp(self): + self.jloc1 = JoinLocation('join(1..200,300..400)') + self.jloc2 = JoinLocation('join(1..100, 200..300)') + self.jloc3 = JoinLocation('join(150..175,180..190,310..320)') + self.jloc4 = JoinLocation('join(complement(100),complement(200))') + + def test_instance(self): + assert isinstance(self.jloc1, JoinLocation) + assert isinstance(self.jloc2, JoinLocation) + + @raises(propex.genbank.LocationError) + def test_invalid_location(self): + jloc = JoinLocation('join(1..200,300..400') + + @raises(propex.genbank.LocationError) + def test_invalid_strands(self): + jloc = JoinLocation('join(complement(100),200)') + + def test_start(self): + # Remember, 0-indexed + assert self.jloc1.start == 0 + assert self.jloc2.start == 0 + assert self.jloc3.start == 149 + + def test_end(self): + assert self.jloc1.end == 399 + assert self.jloc2.end == 299 + assert self.jloc3.end == 319 + + def test_loctype(self): + assert self.jloc1.loctype == 'join' + assert self.jloc2.loctype == 'join' + assert self.jloc3.loctype == 'join' + + def test_complement(self): + assert not self.jloc1.is_complement + assert self.jloc4.is_complement + + def test_complement_wrap(self): + jloc = JoinLocation('complement(join(380844..381260,382591..382872))') + assert isinstance(jloc, JoinLocation) + assert jloc.is_complement + assert jloc.start == 380843 + assert jloc.end == 382871 + + def test_overlap(self): + assert self.jloc1.overlaps(self.jloc2) + assert self.jloc1.overlaps(Location('200..300')) + assert self.jloc1.overlaps(Location('150..250')) + assert not self.jloc3.overlaps(self.jloc2) + assert self.jloc3.overlaps(self.jloc1) + assert 
Location('150..250').overlaps(self.jloc1) + + def test_str(self): + assert str(self.jloc1) == 'join(1..200,300..400)' + assert str(self.jloc2) == 'join(1..100, 200..300)' + + def test_repr(self): + assert repr(self.jloc1) == '' + assert repr(self.jloc2) == '' + + def test_min_distance(self): + assert self.jloc1.min_distance(self.jloc2) == 0 + assert self.jloc1.min_distance(self.jloc3) == 0 + assert self.jloc2.min_distance(self.jloc3) == 10 + assert self.jloc1.min_distance(Location('250')) == 50 + assert Location('250').min_distance(self.jloc1) == 50 From a969830335830de7e9d3c8c0a21bee160dbdacbf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20M=C3=A4hler?= Date: Sat, 18 Apr 2015 22:46:24 +0200 Subject: [PATCH 17/40] Handle join locations The genbank module is now able to handle join locations, including join locations that are wrapped with complement. The comparison operator is currently a bit misleading since the locations join(complement(...),complement(...)) and complement(join(...,...)) will not be equal. Will probably be fixed in the future. --- propex/genbank.py | 141 +++++++++++++++++++++++++++++++++-- propex/tests/test_genbank.py | 90 +++++++++++++++++++++- 2 files changed, 223 insertions(+), 8 deletions(-) diff --git a/propex/genbank.py b/propex/genbank.py index 1117d7f..1a2e899 100644 --- a/propex/genbank.py +++ b/propex/genbank.py @@ -6,6 +6,7 @@ """ import collections +import itertools import re from propex.sequence import Sequence @@ -13,6 +14,120 @@ class LocationError(Exception): pass +def parse_location(locstring): + """Parse a location string and return a :py:class:`.Location` + or :py:class:`.JoinLocation` object. + + :param locstring: a GenBank location string. + :raises: :py:class:`.LocationError` if parsing fails. 
+ """ + if locstring.startswith('join') or \ + locstring.startswith('complement(join'): + return JoinLocation(locstring) + else: + return Location(locstring) + +class JoinLocation(object): + + """Represent a "join" GenBank feature location. + + For more information on locations, see + http://www.insdc.org/files/feature_table.html#3.4 + + For information on how locations work, see :py:class:`.Location`. + + :param locstring: a GenBank location string. + :raises: :py:class:`.LocationError` if parsing fails. + """ + + def __init__(self, locstring): + self.locstring = locstring + self.loctype = 'join' + self.locations, self.start, self.end, self.is_complement = \ + self._get_locations() + + def _get_locations(self): + complement_wrap = False + if self.locstring.startswith('complement'): + compmatch = re.match(r'^complement\((join\(.+\))\)$', self.locstring) + if compmatch is None: + raise LocationError('invalid join location: {0}' \ + .format(self.locstring)) + locstring = compmatch.group(1) + complement_wrap = True + else: + locstring = self.locstring + + match = re.match(r'^join\((.+)\)$', locstring) + if match is None: + raise LocationError('invalid join location: {0}' \ + .format(self.locstring)) + locstring = match.group(1) + locations = [Location(x.strip()) for x in locstring.split(',')] + if len(set(x.is_complement for x in locations)) != 1: + raise LocationError('joint location is located on both strands') + start = min(x.start for x in locations) + end = max(x.end for x in locations) + if not complement_wrap: + is_complement = locations[0].is_complement + else: + is_complement = True + return locations, start, end, is_complement + + def overlaps(self, other): + """Test whether the location overlaps with another location. + + :param other: a :py:class:`.Location` or :py:class:`.JoinLocation` + object. + :returns: True if the locations overlap with at least one base, + otherwise False. 
+ """ + if isinstance(other, JoinLocation): + for loc1, loc2 in itertools.product(self.locations, + other.locations): + if loc1.overlaps(loc2): + return True + else: + for loc in self.locations: + if loc.overlaps(other): + return True + return False + + def min_distance(self, other): + """Get the minimum distance to another location. + + :param other: a :py:class:`.Location` or :py:class:`.JoinLocation` + object. + :returns: the minimum distance between the locations. + """ + min_acc = [] + if isinstance(other, JoinLocation): + for loc1, loc2 in itertools.product(self.locations, + other.locations): + d = loc1.min_distance(loc2) + if d == 0: + return 0 + min_acc.append(d) + else: + for loc in self.locations: + d = loc.min_distance(other) + if d == 0: + return 0 + min_acc.append(d) + return min(min_acc) + + def __str__(self): + return self.locstring + + def __eq__(self, other): + return str(self) == str(other) + + def __ne__(self, other): + return not self == other + + def __repr__(self): + return ''.format(repr(self.locstring)) + class Location(object): """Represent a GenBank feature location. @@ -47,6 +162,7 @@ class Location(object): the complement of the sequence. :param locstring: a GenBank location string. + :raises: :py:class:`.LocationError` if parsing fails. """ #: Regular expression for finding complement locations. @@ -77,7 +193,7 @@ def _regex_dict(self): Returns: a dictionary where the keys correspond to the type of - location and the values are the correpsponding regular + location and the values are the corresponding regular expression. """ return { @@ -122,16 +238,27 @@ def _parse(self): return re_name, start - 1, end - 1, is_complement - def overlaps(self, location): + def overlaps(self, other): """Test whether the location overlaps with another location. - :param location: a Location object. + :param other: a :py:class:`.Location` or :py:class:`.JoinLocation` + object. :returns: True if the locations overlap with at least one base, otherwise False. 
""" - return self.start <= location.end and location.start <= self.end + if isinstance(other, JoinLocation): + return other.overlaps(self) + return self.start <= other.end and other.start <= self.end def min_distance(self, other): + """Get the minimum distance to another location. + + :param other: a :py:class:`.Location` or :py:class:`.JoinLocation` + object. + :returns: the minimum distance between the locations. + """ + if isinstance(other, JoinLocation): + return other.min_distance(self) if self.overlaps(other): return 0 else: @@ -238,14 +365,14 @@ def from_string(cls, locus, feature_string): value = qualifiers[-1][1] + line.strip('"') qualifiers[-1] = (key, value) - return cls(locus, ftype, Location(location), dict(qualifiers)) + return cls(locus, ftype, parse_location(location), dict(qualifiers)) def get_qualifier(self, qualifier_name): """Get a feature qualifier. :param qualifier_name: a string representing a qualifier. :returns: the value of the qualifier. - :raises: KeyError if the feature does not have a qualifier called + :raises: :py:class:`KeyError` if the feature does not have a qualifier called ``qualifier_name``. 
""" if qualifier_name not in self.qualifiers: @@ -444,7 +571,7 @@ def _index(self): indexdicts[-1][feature] = [] indexdicts[-1][feature].append({ 'offset': offset, - 'location': Location(line.strip().split()[1]) + 'location': parse_location(line.strip().split()[1]) }) if line.startswith('FEATURES'): in_features = True diff --git a/propex/tests/test_genbank.py b/propex/tests/test_genbank.py index ed8896c..9f24e5e 100644 --- a/propex/tests/test_genbank.py +++ b/propex/tests/test_genbank.py @@ -3,7 +3,7 @@ import os import propex -from propex.genbank import Location +from propex.genbank import Location, JoinLocation class TestGenBank: @@ -203,6 +203,20 @@ def test_parse_feature(self): assert gbf_inference == ['ab initio prediction:Prodigal:2.60', 'similar to AA sequence:UniProtKB:Q9RVE0'] + def test_feature_join_location(self): + feature = ''' CDS join(52625..53704,54000..55000) + /gene="recF" + /locus_tag="LMG718_02589" + /inference="ab initio prediction:Prodigal:2.60" + /inference="similar to AA sequence:UniProtKB:Q9RVE0" + /codon_start=1 + /transl_table=11''' + gbf = propex.GenBankFeature.from_string('testlocus', feature) + + assert gbf.feature_type == 'CDS' + assert gbf.get_qualifier('gene') == 'recF' + assert len(gbf.get_qualifier('inference')) == 2 + def test_empty_qualifiers(self): feature = ''' CDS complement(52625..53704) /gene="recF" @@ -408,3 +422,77 @@ def test_from_int(self): assert str(Location.from_int(100, 200)) == '100..200' assert str(Location.from_int(100, 200, '-')) == 'complement(100..200)' assert str(Location.from_int(100, strand='-')) == 'complement(100)' + +class TestJoinLocation: + + def setUp(self): + self.jloc1 = JoinLocation('join(1..200,300..400)') + self.jloc2 = JoinLocation('join(1..100, 200..300)') + self.jloc3 = JoinLocation('join(150..175,180..190,310..320)') + self.jloc4 = JoinLocation('join(complement(100),complement(200))') + + def test_instance(self): + assert isinstance(self.jloc1, JoinLocation) + assert isinstance(self.jloc2, 
JoinLocation) + + @raises(propex.genbank.LocationError) + def test_invalid_location(self): + jloc = JoinLocation('join(1..200,300..400') + + @raises(propex.genbank.LocationError) + def test_invalid_strands(self): + jloc = JoinLocation('join(complement(100),200)') + + @raises(propex.genbank.LocationError) + def test_messed_up_location(self): + jloc = JoinLocation('complement(join(687..700,800..900,1000..1100))mRNA <687..>3158') + + def test_start(self): + # Remember, 0-indexed + assert self.jloc1.start == 0 + assert self.jloc2.start == 0 + assert self.jloc3.start == 149 + + def test_end(self): + assert self.jloc1.end == 399 + assert self.jloc2.end == 299 + assert self.jloc3.end == 319 + + def test_loctype(self): + assert self.jloc1.loctype == 'join' + assert self.jloc2.loctype == 'join' + assert self.jloc3.loctype == 'join' + + def test_complement(self): + assert not self.jloc1.is_complement + assert self.jloc4.is_complement + + def test_complement_wrap(self): + jloc = JoinLocation('complement(join(380844..381260,382591..382872))') + assert isinstance(jloc, JoinLocation) + assert jloc.is_complement + assert jloc.start == 380843 + assert jloc.end == 382871 + + def test_overlap(self): + assert self.jloc1.overlaps(self.jloc2) + assert self.jloc1.overlaps(Location('200..300')) + assert self.jloc1.overlaps(Location('150..250')) + assert not self.jloc3.overlaps(self.jloc2) + assert self.jloc3.overlaps(self.jloc1) + assert Location('150..250').overlaps(self.jloc1) + + def test_str(self): + assert str(self.jloc1) == 'join(1..200,300..400)' + assert str(self.jloc2) == 'join(1..100, 200..300)' + + def test_repr(self): + assert repr(self.jloc1) == '' + assert repr(self.jloc2) == '' + + def test_min_distance(self): + assert self.jloc1.min_distance(self.jloc2) == 0 + assert self.jloc1.min_distance(self.jloc3) == 0 + assert self.jloc2.min_distance(self.jloc3) == 10 + assert self.jloc1.min_distance(Location('250')) == 50 + assert Location('250').min_distance(self.jloc1) == 50 From 
32a8dfcd4f97a562408ae8f114f5e45e86e1cb09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20M=C3=A4hler?= Date: Sun, 19 Apr 2015 15:30:14 +0200 Subject: [PATCH 18/40] Handle multiline location strings --- propex/genbank.py | 29 +++++++++++++++++++++++++++-- propex/tests/data/U49845.gb | 6 ++++++ propex/tests/test_genbank.py | 13 +++++++++++++ 3 files changed, 46 insertions(+), 2 deletions(-) diff --git a/propex/genbank.py b/propex/genbank.py index 1a2e899..0249cc0 100644 --- a/propex/genbank.py +++ b/propex/genbank.py @@ -339,10 +339,20 @@ def from_string(cls, locus, feature_string): """ lines = [x.strip() for x in feature_string.splitlines()] ftype, location = lines[0].strip().split() + # Multiline location string + i = 1 + line = lines[i] + while not line.startswith('/'): + i += 1 + location += line + try: + line = lines[i] + except IndexError: + break qualifiers = [] - for line in lines[1:]: + for line in lines[i:]: if line.startswith('/'): # New qualifier i = line.find('=') @@ -569,10 +579,25 @@ def _index(self): features.add(feature) if feature not in indexdicts[-1]: indexdicts[-1][feature] = [] + + locstring = line.strip().split()[1] + + nl = f.next() + loc_offset = offset + len(line) + while len(nl[:21].strip()) == 0 and not nl.strip().startswith('/'): + locstring += nl.strip() + loc_offset += len(nl) + nl = f.next() + + f.seek(loc_offset) + indexdicts[-1][feature].append({ 'offset': offset, - 'location': parse_location(line.strip().split()[1]) + 'location': parse_location(locstring) }) + + offset = loc_offset + continue if line.startswith('FEATURES'): in_features = True offset += len(line) diff --git a/propex/tests/data/U49845.gb b/propex/tests/data/U49845.gb index c053364..a523cdd 100644 --- a/propex/tests/data/U49845.gb +++ b/propex/tests/data/U49845.gb @@ -37,6 +37,12 @@ FEATURES Location/Qualifiers AEVLLRVDNIIRARPRTANRQHM" gene <687..>3158 /gene="AXL2" + gene complement(join(687..700,800..900, + 1000..1100)) + gene complement(join(687..700,800..900, + 
1000..1100)) + /gene="testGene" + /product="blargh" mRNA <687..>3158 /gene="AXL2" /product="Axl2p" diff --git a/propex/tests/test_genbank.py b/propex/tests/test_genbank.py index 9f24e5e..c4f501d 100644 --- a/propex/tests/test_genbank.py +++ b/propex/tests/test_genbank.py @@ -217,6 +217,19 @@ def test_feature_join_location(self): assert gbf.get_qualifier('gene') == 'recF' assert len(gbf.get_qualifier('inference')) == 2 + def test_multiline_location(self): + feature = ''' CDS complement(join(1294426..1294992,1294992..1295141, + 1295140..1295322)) + /gene="insZ" + /locus_tag="b4573"''' + gbf = propex.GenBankFeature.from_string('testlocus', feature) + + assert gbf.feature_type == 'CDS' + assert isinstance(gbf.location, JoinLocation) + assert len(gbf.location.locations) == 3 + assert gbf.get_qualifier('gene') == 'insZ' + assert gbf.get_qualifier('locus_tag') == 'b4573' + def test_empty_qualifiers(self): feature = ''' CDS complement(52625..53704) /gene="recF" From 7e545488ac38cc958445d065a2f24d66a1471627 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20M=C3=A4hler?= Date: Sun, 19 Apr 2015 15:36:44 +0200 Subject: [PATCH 19/40] Test three line location string --- propex/tests/test_genbank.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/propex/tests/test_genbank.py b/propex/tests/test_genbank.py index c4f501d..cc59c32 100644 --- a/propex/tests/test_genbank.py +++ b/propex/tests/test_genbank.py @@ -230,6 +230,18 @@ def test_multiline_location(self): assert gbf.get_qualifier('gene') == 'insZ' assert gbf.get_qualifier('locus_tag') == 'b4573' + feature = ''' CDS complement(join(1294426..1294992, + 1294992..1295141, + 1295140..1295322)) + /gene="insZ" + /locus_tag="b4573"''' + + assert gbf.feature_type == 'CDS' + assert isinstance(gbf.location, JoinLocation) + assert len(gbf.location.locations) == 3 + assert gbf.get_qualifier('gene') == 'insZ' + assert gbf.get_qualifier('locus_tag') == 'b4573' + def test_empty_qualifiers(self): feature = ''' CDS 
complement(52625..53704) /gene="recF" From f283ab0a7af76923dbc8b8b265fdf74b631c9470 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20M=C3=A4hler?= Date: Sun, 19 Apr 2015 15:41:27 +0200 Subject: [PATCH 20/40] Better handling of empty lines This solution is not very pretty, but it works for now. --- propex/genbank.py | 4 ++++ propex/tests/data/U49845.gb | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/propex/genbank.py b/propex/genbank.py index 0249cc0..e2c830c 100644 --- a/propex/genbank.py +++ b/propex/genbank.py @@ -714,9 +714,13 @@ def __getitem__(self, index): f.seek(feature['offset']) feature_string = f.readline() line = f.readline() + while len(line) < 6: + line = f.readline() while line[5] == ' ': feature_string += line line = f.readline() + while len(line) < 6: + line = f.readline() features[ftype].append( GenBankFeature.from_string(locus_index['name'], feature_string)) diff --git a/propex/tests/data/U49845.gb b/propex/tests/data/U49845.gb index a523cdd..47979fb 100644 --- a/propex/tests/data/U49845.gb +++ b/propex/tests/data/U49845.gb @@ -68,7 +68,7 @@ FEATURES Location/Qualifiers YGSQKTVDTEKLFDLEAPEKEKRTSRDVTMSSLDPWNSNISPSPVRKSVTPSPYNVTK HRNRHLQNIQDSQSGKNGITPTTMSTSSSDDFVPVKDGENFCWVHSMEPDRRPSKKRL VDFSNKSNVNVGQVKDIHGRIPEML" - + gene complement(<3300..>4037) /gene="REV7" mRNA complement(<3300..>4037) @@ -85,7 +85,7 @@ FEATURES Location/Qualifiers KDDQIITETEVFDEFRSSLNSLIMHLEKLPKVNDDTITFEAVINAIELELGHKLDRNR RVDSLEEKAEIERDSNWVKCQEDENLPDNNGFQPPKIKLTSLVGSDVGPLIIHQFSEK LISGDDKILNGVYSQYEEGESIFGSLF" -ORIGIN +ORIGIN 1 gatcctccat atacaacggt atctccacct caggtttaga tctcaacaac ggaaccattg 61 ccgacatgag acagttaggt atcgtcgaga gttacaagct aaaacgagca gtagtcagct 121 ctgcatctga agccgctgaa gttctactaa gggtggataa catcatccgt gcaagaccaa From e10646f8f9556f824627775ddc10f8e14727d25e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20M=C3=A4hler?= Date: Sun, 19 Apr 2015 16:07:53 +0200 Subject: [PATCH 21/40] Fix duplicate qualifier bug --- propex/genbank.py | 11 
+++++++++-- propex/tests/test_genbank.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/propex/genbank.py b/propex/genbank.py index e2c830c..b0ddcdc 100644 --- a/propex/genbank.py +++ b/propex/genbank.py @@ -366,13 +366,20 @@ def from_string(cls, locus, feature_string): if len(qualifiers) > 0 and key == qualifiers[-1][0]: # Multiple qualifiers with the same key - qualifiers[-1] = (key, [qualifiers[-1][1], value]) + if isinstance(qualifiers[-1][1], list): + qualifiers[-1][1].append(value) + else: + qualifiers[-1] = (key, [qualifiers[-1][1], value]) else: qualifiers.append((key, value)) else: # Continuation of qualifier key = qualifiers[-1][0] - value = qualifiers[-1][1] + line.strip('"') + if isinstance(qualifiers[-1][1], list): + value = qualifiers[-1][1] + value[-1] += ' ' + line.strip('"') + else: + value = qualifiers[-1][1] + ' ' + line.strip('"') qualifiers[-1] = (key, value) return cls(locus, ftype, parse_location(location), dict(qualifiers)) diff --git a/propex/tests/test_genbank.py b/propex/tests/test_genbank.py index cc59c32..e267239 100644 --- a/propex/tests/test_genbank.py +++ b/propex/tests/test_genbank.py @@ -203,6 +203,34 @@ def test_parse_feature(self): assert gbf_inference == ['ab initio prediction:Prodigal:2.60', 'similar to AA sequence:UniProtKB:Q9RVE0'] + def test_multiple_qualifiers(self): + feature = ''' ncRNA 476448..476561 + /ncRNA_class="SRP_RNA" + /gene="ffs" + /locus_tag="b0455" + /gene_synonym="ECK0449" + /gene_synonym="JWR0009" + /product="4.5S sRNA component of Signal Recognition + Particle (SRP)" + /note="4.5S RNA; component of ribonucleoprotein particle; + works with the Ffh protein; + adjusted endpoints to reflect the mature 4.5S RNA (114 + nt)" + /function="2.2.6 information transfer; RNA related; rRNA, + stable RNA" + /function="2.3.2 information transfer; protein related; + translation" + /function="7.1 location of gene products; cytoplasm" + /function="component of Signal 
Recognition Particle (SRP) + with the Ffh protein; involved in co-translational + targeting of proteins to membranes" + /function="RNA; Ribosomal and stable RNAs" + /db_xref="ASAP:ABE-0001579" + /db_xref="EcoGene:EG30027"''' + gbf = propex.GenBankFeature.from_string('testlocus', feature) + func = gbf.get_qualifier('function') + assert len(func) == 5 + def test_feature_join_location(self): feature = ''' CDS join(52625..53704,54000..55000) /gene="recF" From c2071b2e7da99ffa56c9139b53c8c00b3133e801 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= Date: Sun, 19 Apr 2015 18:57:09 +0200 Subject: [PATCH 22/40] Change name from propex to seqpoet, close #7 I will update the name of the GitHub repo when I merge these changes onto the master branch. --- bin/{propex => seqpoet} | 28 ++++---- {propex => seqpoet}/__init__.py | 0 {propex => seqpoet}/fasta.py | 2 +- {propex => seqpoet}/genbank.py | 2 +- {propex => seqpoet}/search.py | 0 {propex => seqpoet}/sequence.py | 0 {propex => seqpoet}/tests/data/U49845.gb | 0 {propex => seqpoet}/tests/data/dups.fasta | 0 {propex => seqpoet}/tests/data/dups.fasta.fai | 0 .../tests/data/dups_noindex.fasta | 0 .../tests/data/empty.fasta.fai | 0 .../tests/data/empty_sequence.fasta | 0 .../tests/data/sample_primers.txt | 0 .../tests/data/sample_sequence.fa | 0 {propex => seqpoet}/tests/data/uneven.fasta | 0 .../tests/data/valid_index.fasta | 0 .../tests/data/valid_index.fasta.fai | 0 .../tests/data/valid_noindex.fasta | 0 {propex => seqpoet}/tests/test_fasta.py | 58 +++++++-------- {propex => seqpoet}/tests/test_genbank.py | 72 +++++++++---------- {propex => seqpoet}/tests/test_search.py | 8 +-- {propex => seqpoet}/tests/test_sequence.py | 22 +++--- setup.py | 16 ++--- 23 files changed, 104 insertions(+), 104 deletions(-) rename bin/{propex => seqpoet} (93%) rename {propex => seqpoet}/__init__.py (100%) rename {propex => seqpoet}/fasta.py (99%) rename {propex => seqpoet}/genbank.py (99%) rename {propex => seqpoet}/search.py 
(100%) rename {propex => seqpoet}/sequence.py (100%) rename {propex => seqpoet}/tests/data/U49845.gb (100%) rename {propex => seqpoet}/tests/data/dups.fasta (100%) rename {propex => seqpoet}/tests/data/dups.fasta.fai (100%) rename {propex => seqpoet}/tests/data/dups_noindex.fasta (100%) rename {propex => seqpoet}/tests/data/empty.fasta.fai (100%) rename {propex => seqpoet}/tests/data/empty_sequence.fasta (100%) rename {propex => seqpoet}/tests/data/sample_primers.txt (100%) rename {propex => seqpoet}/tests/data/sample_sequence.fa (100%) rename {propex => seqpoet}/tests/data/uneven.fasta (100%) rename {propex => seqpoet}/tests/data/valid_index.fasta (100%) rename {propex => seqpoet}/tests/data/valid_index.fasta.fai (100%) rename {propex => seqpoet}/tests/data/valid_noindex.fasta (100%) rename {propex => seqpoet}/tests/test_fasta.py (79%) rename {propex => seqpoet}/tests/test_genbank.py (91%) rename {propex => seqpoet}/tests/test_search.py (94%) rename {propex => seqpoet}/tests/test_sequence.py (72%) diff --git a/bin/propex b/bin/seqpoet similarity index 93% rename from bin/propex rename to bin/seqpoet index 7fe112f..5528e67 100644 --- a/bin/propex +++ b/bin/seqpoet @@ -5,12 +5,12 @@ import itertools import os import sys -import propex +import seqpoet def get_probe(fname): with open(fname) as f: try: - seqs = [propex.sequence.Sequence(line.strip()) for line in f \ + seqs = [seqpoet.sequence.Sequence(line.strip()) for line in f \ if len(line.strip()) > 0] except ValueError: print('ERROR: probe file does not contain valid sequences', @@ -30,14 +30,14 @@ def get_single_sequence(fname, genbank_only=False, stop_on_error=False): genbank_success = False fasta_success = False try: - seq = propex.GenBank(fname) + seq = seqpoet.GenBank(fname) genbank_success = True - except propex.genbank.ParsingError: + except seqpoet.genbank.ParsingError: pass if not genbank_success and not genbank_only: try: - seq = propex.Fasta(fname) + seq = seqpoet.Fasta(fname) fasta_success = True 
except ValueError: pass @@ -100,10 +100,10 @@ def match_probe(probe, seqs, mismatches=2): pl = len(probe) for f in seqs.itervalues(): for i, record in enumerate(f): - res1 = propex.search.search(str(probe), str(record.seq), + res1 = seqpoet.search.search(str(probe), str(record.seq), mismatches=mismatches) res2 = [len(record.seq) - x - pl for x in \ - propex.search.search(str(probe), str(record.seq.revcomp()), + seqpoet.search.search(str(probe), str(record.seq.revcomp()), mismatches=mismatches)] if len(res1) > 0: @@ -143,16 +143,16 @@ def match_primer(primers, seqs, mismatches=2, pl2 = len(primers[1]) for f in seqs.itervalues(): for i, record in enumerate(f): - res1_1 = propex.search.search(str(primers[0]), str(record.seq), + res1_1 = seqpoet.search.search(str(primers[0]), str(record.seq), mismatches=mismatches) res1_2 = [len(record.seq) - x - pl1 for x in \ - propex.search.search(str(primers[0]), str(record.seq.revcomp()), + seqpoet.search.search(str(primers[0]), str(record.seq.revcomp()), mismatches=mismatches)] - res2_1 = propex.search.search(str(primers[1]), str(record.seq), + res2_1 = seqpoet.search.search(str(primers[1]), str(record.seq), mismatches=mismatches) res2_2 = [len(record.seq) - x - pl2 for x in \ - propex.search.search(str(primers[1]), str(record.seq.revcomp()), + seqpoet.search.search(str(primers[1]), str(record.seq.revcomp()), mismatches=mismatches)] # Match res1_1 with res2_2 and res2_1 with res1_2 to get primer @@ -208,7 +208,7 @@ def find_operon(matches, seqs, max_distance=500): for m in matches: gb = seqs[m['filename']] locus = gb[m['seqindex']] - location = propex.genbank.Location.from_int(m['hitstart'], m['hitend']) + location = seqpoet.genbank.Location.from_int(m['hitstart'], m['hitend']) features = locus.features_at_location(location) if len(features) == 0: print('WARNING: no gene for match in locus {0}'.format(m['seqname']), @@ -268,7 +268,7 @@ def write_fasta(matches, filename=sys.stdout): for m in matches: m['filename'] = 
os.path.basename(m['filename']) - s = propex.fasta.FastaRecord(m['seq'], + s = seqpoet.fasta.FastaRecord(m['seq'], '{filename}:{seqname}:{hitstart}:{hitend}:{length}:{strand}' \ .format(**m)) print(s, file=f) @@ -311,7 +311,7 @@ def parse_args(): default=sys.stdout) parser.add_argument('--version', help=('print version and exit'), - action='version', version='%(prog)s v{0}'.format(propex.__version__)) + action='version', version='%(prog)s v{0}'.format(seqpoet.__version__)) args = parser.parse_args() diff --git a/propex/__init__.py b/seqpoet/__init__.py similarity index 100% rename from propex/__init__.py rename to seqpoet/__init__.py diff --git a/propex/fasta.py b/seqpoet/fasta.py similarity index 99% rename from propex/fasta.py rename to seqpoet/fasta.py index a86de1b..eb0d50e 100644 --- a/propex/fasta.py +++ b/seqpoet/fasta.py @@ -7,7 +7,7 @@ import os import textwrap -from propex.sequence import Sequence +from seqpoet.sequence import Sequence class FastaIndex(object): """Represents an index for a FASTA file. 
diff --git a/propex/genbank.py b/seqpoet/genbank.py similarity index 99% rename from propex/genbank.py rename to seqpoet/genbank.py index b0ddcdc..9f17a3b 100644 --- a/propex/genbank.py +++ b/seqpoet/genbank.py @@ -9,7 +9,7 @@ import itertools import re -from propex.sequence import Sequence +from seqpoet.sequence import Sequence class LocationError(Exception): pass diff --git a/propex/search.py b/seqpoet/search.py similarity index 100% rename from propex/search.py rename to seqpoet/search.py diff --git a/propex/sequence.py b/seqpoet/sequence.py similarity index 100% rename from propex/sequence.py rename to seqpoet/sequence.py diff --git a/propex/tests/data/U49845.gb b/seqpoet/tests/data/U49845.gb similarity index 100% rename from propex/tests/data/U49845.gb rename to seqpoet/tests/data/U49845.gb diff --git a/propex/tests/data/dups.fasta b/seqpoet/tests/data/dups.fasta similarity index 100% rename from propex/tests/data/dups.fasta rename to seqpoet/tests/data/dups.fasta diff --git a/propex/tests/data/dups.fasta.fai b/seqpoet/tests/data/dups.fasta.fai similarity index 100% rename from propex/tests/data/dups.fasta.fai rename to seqpoet/tests/data/dups.fasta.fai diff --git a/propex/tests/data/dups_noindex.fasta b/seqpoet/tests/data/dups_noindex.fasta similarity index 100% rename from propex/tests/data/dups_noindex.fasta rename to seqpoet/tests/data/dups_noindex.fasta diff --git a/propex/tests/data/empty.fasta.fai b/seqpoet/tests/data/empty.fasta.fai similarity index 100% rename from propex/tests/data/empty.fasta.fai rename to seqpoet/tests/data/empty.fasta.fai diff --git a/propex/tests/data/empty_sequence.fasta b/seqpoet/tests/data/empty_sequence.fasta similarity index 100% rename from propex/tests/data/empty_sequence.fasta rename to seqpoet/tests/data/empty_sequence.fasta diff --git a/propex/tests/data/sample_primers.txt b/seqpoet/tests/data/sample_primers.txt similarity index 100% rename from propex/tests/data/sample_primers.txt rename to 
seqpoet/tests/data/sample_primers.txt diff --git a/propex/tests/data/sample_sequence.fa b/seqpoet/tests/data/sample_sequence.fa similarity index 100% rename from propex/tests/data/sample_sequence.fa rename to seqpoet/tests/data/sample_sequence.fa diff --git a/propex/tests/data/uneven.fasta b/seqpoet/tests/data/uneven.fasta similarity index 100% rename from propex/tests/data/uneven.fasta rename to seqpoet/tests/data/uneven.fasta diff --git a/propex/tests/data/valid_index.fasta b/seqpoet/tests/data/valid_index.fasta similarity index 100% rename from propex/tests/data/valid_index.fasta rename to seqpoet/tests/data/valid_index.fasta diff --git a/propex/tests/data/valid_index.fasta.fai b/seqpoet/tests/data/valid_index.fasta.fai similarity index 100% rename from propex/tests/data/valid_index.fasta.fai rename to seqpoet/tests/data/valid_index.fasta.fai diff --git a/propex/tests/data/valid_noindex.fasta b/seqpoet/tests/data/valid_noindex.fasta similarity index 100% rename from propex/tests/data/valid_noindex.fasta rename to seqpoet/tests/data/valid_noindex.fasta diff --git a/propex/tests/test_fasta.py b/seqpoet/tests/test_fasta.py similarity index 79% rename from propex/tests/test_fasta.py rename to seqpoet/tests/test_fasta.py index e844609..f3c9d31 100644 --- a/propex/tests/test_fasta.py +++ b/seqpoet/tests/test_fasta.py @@ -2,7 +2,7 @@ from nose.plugins.skip import SkipTest import os -import propex +import seqpoet class TestFastaIndex: @@ -13,61 +13,61 @@ def setUp(self): self.empty_index = os.path.join(testdir, 'data', 'empty.fasta.fai') def test_faidx_length(self): - faidx = propex.FastaIndex(self.valid_index) + faidx = seqpoet.FastaIndex(self.valid_index) assert len(faidx) == 4 def test_faidx_order(self): - faidx = propex.FastaIndex(self.valid_index) + faidx = seqpoet.FastaIndex(self.valid_index) assert faidx[0]['name'] == 'seq1' assert faidx[1]['name'] == 'seq2' assert faidx[2]['name'] == 'aaa' assert faidx[3]['name'] == 'bbb' def test_str(self): - faidx = 
propex.FastaIndex(self.valid_index) + faidx = seqpoet.FastaIndex(self.valid_index) assert str(faidx) == '\n'.join(['seq1\t78\t6\t28\t29', 'seq2\t28\t93\t28\t29', 'aaa\t44\t127\t28\t29', 'bbb\t73\t178\t28\t29']) def test_keys(self): - faidx = propex.FastaIndex(self.valid_index) + faidx = seqpoet.FastaIndex(self.valid_index) assert faidx.keys() == ['seq1', 'seq2', 'aaa', 'bbb'] def test_iter(self): - faidx = propex.FastaIndex(self.valid_index) + faidx = seqpoet.FastaIndex(self.valid_index) for k, v in faidx: pass def test_repr(self): - faidx = propex.FastaIndex(self.valid_index) + faidx = seqpoet.FastaIndex(self.valid_index) assert repr(faidx) == '' \ .format(os.path.splitext(self.valid_index)[0]) @raises(ValueError) def test_nonexisting_index(self): - faidx = propex.FastaIndex(self.invalid_index) + faidx = seqpoet.FastaIndex(self.invalid_index) @raises(ValueError) def test_empty_index(self): - faidx = propex.FastaIndex(self.empty_index) + faidx = seqpoet.FastaIndex(self.empty_index) @raises(ValueError) def test_incorrect_filetype(self): - faidx = propex.FastaIndex(os.path.splitext(self.invalid_index)[0]) + faidx = seqpoet.FastaIndex(os.path.splitext(self.invalid_index)[0]) class TestFastaRecord: def test_string_sequence(self): - fr = propex.FastaRecord('accaggata', 'test') + fr = seqpoet.FastaRecord('accaggata', 'test') @raises(ValueError) def test_invalid_sequence(self): - fr = propex.FastaRecord('thisisnotdna', 'test') + fr = seqpoet.FastaRecord('thisisnotdna', 'test') @raises(TypeError) def test_wrong_sequence_type(self): - fr = propex.FastaRecord(['a', 'c', 'g', 't'], 'test') + fr = seqpoet.FastaRecord(['a', 'c', 'g', 't'], 'test') class TestFasta: @@ -78,11 +78,11 @@ def setUp(self): self.dups_fname = os.path.join(testdir, 'data', 'dups.fasta') def test_fasta_length(self): - fasta = propex.Fasta(self.valid_index) + fasta = seqpoet.Fasta(self.valid_index) assert len(fasta) == 4, 'unexpected number of sequences' def test_fasta_headers(self): - fasta = 
propex.Fasta(self.valid_index) + fasta = seqpoet.Fasta(self.valid_index) headers = ['seq1', 'seq2', 'aaa', 'bbb'] for i, record in enumerate(fasta): assert record.name == headers[i], \ @@ -91,14 +91,14 @@ def test_fasta_headers(self): 'spaces in sequence' def test_sequence_length(self): - fasta = propex.Fasta(self.valid_index) + fasta = seqpoet.Fasta(self.valid_index) lens = [78, 28, 44, 73] for i, record in enumerate(fasta): assert len(record) == lens[i], \ 'sequence length ({0}) is not {1}'.format(len(record), lens[i]) def test_indexing(self): - fasta = propex.Fasta(self.valid_index) + fasta = seqpoet.Fasta(self.valid_index) assert len(fasta[1]) == 28 assert fasta[1].seq == 'cacaggaggatagaccagatgacagata' assert repr(fasta[1]) == ' (28 nt)>', \ @@ -106,12 +106,12 @@ def test_indexing(self): @raises(IndexError) def test_invalid_index(self): - fasta = propex.Fasta(self.valid_index) + fasta = seqpoet.Fasta(self.valid_index) fasta[4] @raises(ValueError) def test_parse_duplicate_fasta(self): - fasta = propex.Fasta(self.dups_fname) + fasta = seqpoet.Fasta(self.dups_fname) class TestFastaWithoutIndex: @@ -125,11 +125,11 @@ def tearDown(self): os.unlink(self.valid_noindex + '.fai') def test_fasta_length(self): - fasta = propex.Fasta(self.valid_noindex) + fasta = seqpoet.Fasta(self.valid_noindex) assert len(fasta) == 4, 'found {0} seqs, expected 4'.format(len(fasta)) def test_fasta_headers(self): - fasta = propex.Fasta(self.valid_noindex) + fasta = seqpoet.Fasta(self.valid_noindex) headers = ['seq1', 'seq2', 'aaa', 'bbb'] for i, record in enumerate(fasta): assert record.name == headers[i], \ @@ -138,14 +138,14 @@ def test_fasta_headers(self): 'spaces in sequence' def test_sequence_length(self): - fasta = propex.Fasta(self.valid_noindex) + fasta = seqpoet.Fasta(self.valid_noindex) lens = [78, 28, 44, 73] for i, record in enumerate(fasta): assert len(record) == lens[i], \ 'sequence length ({0}) is not {1}'.format(len(record), lens[i]) def test_record_repr(self): - fasta = 
propex.Fasta(self.valid_noindex) + fasta = seqpoet.Fasta(self.valid_noindex) headers = ['seq1', 'seq2', 'aaa', 'bbb'] seqs = ['actaa', 'cacag', 'actga', 'acatc'] lens = [78, 28, 44, 73] @@ -156,7 +156,7 @@ def test_record_repr(self): @raises(ValueError) def test_duplicate_headers(self): - fasta = propex.Fasta(self.dups_noindex) + fasta = seqpoet.Fasta(self.dups_noindex) class TestInvalidFasta: @@ -168,10 +168,10 @@ def setUp(self): 'operon_extractor', 'data_genbank', 'LMG718-cremoris.gb') def test_empty_sequence(self): - fasta = propex.Fasta(self.empty_sequence) + fasta = seqpoet.Fasta(self.empty_sequence) def test_fasta_headers(self): - fasta = propex.Fasta(self.empty_sequence) + fasta = seqpoet.Fasta(self.empty_sequence) headers = ['seq1', 'seq2', 'empty', 'aaa', 'bbb'] for i, record in enumerate(fasta): assert record.name == headers[i], \ @@ -180,7 +180,7 @@ def test_fasta_headers(self): 'spaces in sequence' def test_sequence_length(self): - fasta = propex.Fasta(self.empty_sequence) + fasta = seqpoet.Fasta(self.empty_sequence) lens = [78, 28, 0, 44, 73] for i, record in enumerate(fasta): assert len(record) == lens[i], \ @@ -190,8 +190,8 @@ def test_sequence_length(self): def test_genbank(self): if not os.path.isfile(self.gb): raise SkipTest - fasta = propex.Fasta(self.gb) + fasta = seqpoet.Fasta(self.gb) @raises(ValueError) def test_uneven_rows(self): - fasta = propex.Fasta(self.uneven) + fasta = seqpoet.Fasta(self.uneven) diff --git a/propex/tests/test_genbank.py b/seqpoet/tests/test_genbank.py similarity index 91% rename from propex/tests/test_genbank.py rename to seqpoet/tests/test_genbank.py index e267239..d9f6593 100644 --- a/propex/tests/test_genbank.py +++ b/seqpoet/tests/test_genbank.py @@ -2,15 +2,15 @@ from nose.plugins.skip import SkipTest import os -import propex -from propex.genbank import Location, JoinLocation +import seqpoet +from seqpoet.genbank import Location, JoinLocation class TestGenBank: def setUp(self): self.testdir = 
os.path.dirname(__file__) self.sc = os.path.join(self.testdir, 'data', 'U49845.gb') - self.gb = propex.GenBank(self.sc) + self.gb = seqpoet.GenBank(self.sc) def test_sequence_length(self): assert len(self.gb[0].seq) == 5028 @@ -56,29 +56,29 @@ def setUp(self): self.lmga18 = os.path.join(self.genbankdir, 'LMGA18-cremoris.gb') def test_index_length(self): - gb = propex.GenBank(self.lmg718) + gb = seqpoet.GenBank(self.lmg718) assert len(gb) == 251, 'unexpected number of loci: {0}'.format(len(gb)) def test_duplicate_locus_length(self): - gb = propex.GenBank(self.lmga18) + gb = seqpoet.GenBank(self.lmga18) assert len(gb) == 231, 'unexpected number of loci: {0}'.format(len(gb)) def test_sequence_length(self): - gb = propex.GenBank(self.lmg718) + gb = seqpoet.GenBank(self.lmg718) assert len(gb[0].seq) == 1522 def test_iteration(self): - gb = propex.GenBank(self.lmg718) + gb = seqpoet.GenBank(self.lmg718) for locus in gb: pass def test_load_directory(self): - gbs = [propex.GenBank(os.path.join(self.genbankdir, x)) \ + gbs = [seqpoet.GenBank(os.path.join(self.genbankdir, x)) \ for x in os.listdir(self.genbankdir) \ if os.path.isfile(os.path.join(self.genbankdir, x))] def test_features_at_location(self): - gb = propex.GenBank(self.lmg718) + gb = seqpoet.GenBank(self.lmg718) locus = gb.get_locus_from_name('718_Contig_100_c')[0] f = locus.features_at_location(Location('800')) assert len(f) == 1, 'found {0} features, expected 1'.format(len(f)) @@ -95,39 +95,39 @@ def test_features_at_location(self): assert f[1].get_qualifier('locus_tag') == 'LMG718_00020' def test_get_locus_from_name(self): - gb = propex.GenBank(self.lmg718) + gb = seqpoet.GenBank(self.lmg718) loci = gb.get_locus_from_name('718_Contig_106_c') assert len(loci) > 0 assert len(loci[0].seq) == 8967 - @raises(propex.genbank.ParsingError) + @raises(seqpoet.genbank.ParsingError) def test_parse_fasta(self): - gb = propex.GenBank(os.path.join(self.genbankdir, '..', 'data_fasta', + gb = 
seqpoet.GenBank(os.path.join(self.genbankdir, '..', 'data_fasta', 'LMG718-cremoris.fasta')) def test_next_downstream(self): - gb = propex.GenBank(self.lmg718) + gb = seqpoet.GenBank(self.lmg718) locus = gb.get_locus_from_name('718_Contig_10_co')[0] gbf = locus.features_at_location(Location('1355'))[0] next = locus.next_downstream(gbf) assert str(next.location) == '2532..2819' def test_next_downstream_duplicate_loci(self): - gb = propex.GenBank(self.lmga18) + gb = seqpoet.GenBank(self.lmga18) locus = gb.get_locus_from_name('LMGA18_Contig_10')[1] gbf = locus.features_at_location(Location('301'))[0] next = locus.next_downstream(gbf) assert str(next.location) == '3180..3404' def test_next_downstream_last(self): - gb = propex.GenBank(self.lmg718) + gb = seqpoet.GenBank(self.lmg718) locus = gb.get_locus_from_name('718_Contig_102_c')[0] gbf = locus.features_at_location(Location('9765'))[0] next = locus.next_downstream(gbf) assert next is None def test_next_downstream_complement(self): - gb = propex.GenBank(self.lmg718) + gb = seqpoet.GenBank(self.lmg718) locus = gb.get_locus_from_name('718_Contig_101_c')[0] gbf = locus.features_at_location(Location('7664'))[0] next = locus.next_downstream(gbf) @@ -138,28 +138,28 @@ def test_next_downstream_complement(self): assert str(next.location) == 'complement(2752..5457)' def test_next_upstream(self): - gb = propex.GenBank(self.lmg718) + gb = seqpoet.GenBank(self.lmg718) locus = gb.get_locus_from_name('718_Contig_106_c')[0] gbf = locus.features_at_location(Location('754'))[0] next = locus.next_upstream(gbf) assert str(next.location) == '58..747' def test_next_upstream_duplicate_loci(self): - gb = propex.GenBank(self.lmga18) + gb = seqpoet.GenBank(self.lmga18) locus = gb.get_locus_from_name('LMGA18_Contig_10')[1] gbf = locus.features_at_location(Location('3180'))[0] next = locus.next_upstream(gbf) assert str(next.location) == '301..1245' def test_next_upstream_last(self): - gb = propex.GenBank(self.lmg718) + gb = 
seqpoet.GenBank(self.lmg718) locus = gb.get_locus_from_name('718_Contig_106_c')[0] gbf = locus.features_at_location(Location('58'))[0] next = locus.next_upstream(gbf) assert next is None def test_next_upstream_complement(self): - gb = propex.GenBank(self.lmg718) + gb = seqpoet.GenBank(self.lmg718) locus = gb.get_locus_from_name('718_Contig_106_c')[0] gbf = locus.features_at_location(Location('7161'))[0] next = locus.next_upstream(gbf) @@ -173,7 +173,7 @@ class TestGenBankFeature: def test_qualifier_names(self): f = {'name': 'lalala'} - gbf = propex.GenBankFeature('testlocus', 'CDS', '123..679', f) + gbf = seqpoet.GenBankFeature('testlocus', 'CDS', '123..679', f) assert gbf.get_qualifier('name') == f['name'], \ 'wrong name: {0}'.format(gbf.get_qualifier('name')) @@ -193,7 +193,7 @@ def test_parse_feature(self): LKITYNQNVKTDFSKELLSRQDHDIFRHQTTVGPHRDDLQFFINEINVADFGSQGQQR TVTLSIKLAEIDLIFEETGEYPILLLDDVMSELDNHRQLDLIETSLGKTQTFITTTTL DHLKNLPENLSIFHVTDGTIEKEKE"''' - gbf = propex.GenBankFeature.from_string('testlocus', feature) + gbf = seqpoet.GenBankFeature.from_string('testlocus', feature) assert gbf.feature_type == 'CDS' gbf_gene = gbf.get_qualifier('gene') @@ -227,7 +227,7 @@ def test_multiple_qualifiers(self): /function="RNA; Ribosomal and stable RNAs" /db_xref="ASAP:ABE-0001579" /db_xref="EcoGene:EG30027"''' - gbf = propex.GenBankFeature.from_string('testlocus', feature) + gbf = seqpoet.GenBankFeature.from_string('testlocus', feature) func = gbf.get_qualifier('function') assert len(func) == 5 @@ -239,7 +239,7 @@ def test_feature_join_location(self): /inference="similar to AA sequence:UniProtKB:Q9RVE0" /codon_start=1 /transl_table=11''' - gbf = propex.GenBankFeature.from_string('testlocus', feature) + gbf = seqpoet.GenBankFeature.from_string('testlocus', feature) assert gbf.feature_type == 'CDS' assert gbf.get_qualifier('gene') == 'recF' @@ -250,7 +250,7 @@ def test_multiline_location(self): 1295140..1295322)) /gene="insZ" /locus_tag="b4573"''' - gbf = 
propex.GenBankFeature.from_string('testlocus', feature) + gbf = seqpoet.GenBankFeature.from_string('testlocus', feature) assert gbf.feature_type == 'CDS' assert isinstance(gbf.location, JoinLocation) @@ -276,7 +276,7 @@ def test_empty_qualifiers(self): /locus_tag= /note /random=""''' - gbf = propex.GenBankFeature.from_string('testlocus', feature) + gbf = seqpoet.GenBankFeature.from_string('testlocus', feature) assert gbf.get_qualifier('locus_tag') == '' assert gbf.get_qualifier('note') is None @@ -286,22 +286,22 @@ def test_empty_qualifiers(self): def test_missing_qualifier(self): feature = ''' CDS complement(52625..53704) /gene="recF"''' - gbf = propex.GenBankFeature.from_string('testlocus', feature) + gbf = seqpoet.GenBankFeature.from_string('testlocus', feature) gbf.get_qualifier('locus_tag') def test_empty_qualifiers(self): - gbf = propex.GenBankFeature('testlocus', 'CDS', '123..679') + gbf = seqpoet.GenBankFeature('testlocus', 'CDS', '123..679') assert isinstance(gbf.qualifiers, list) assert len(gbf.qualifiers) == 0 def test_equality(self): - gbf1 = propex.GenBankFeature('testlocus', 'CDS', + gbf1 = seqpoet.GenBankFeature('testlocus', 'CDS', Location('123..679'), {'name': 'randomname'}) - gbf2 = propex.GenBankFeature('testlocus', 'CDS', + gbf2 = seqpoet.GenBankFeature('testlocus', 'CDS', Location('123..679'), {'name': 'randomname'}) - gbf3 = propex.GenBankFeature('testlocus', 'CDS', + gbf3 = seqpoet.GenBankFeature('testlocus', 'CDS', Location('123..679'), {'name': 'otherrandomname'}) - gbf4 = propex.GenBankFeature('testlocus', 'CDS', + gbf4 = seqpoet.GenBankFeature('testlocus', 'CDS', Location('120..679'), {'name': 'randomname'}) assert gbf1 == gbf2 @@ -443,7 +443,7 @@ def test_overlap(self): assert not loc4.overlaps(loc3) assert not loc1.overlaps(loc5) - @raises(propex.genbank.LocationError) + @raises(seqpoet.genbank.LocationError) def test_invalid_location(self): loc = Location('123..noloc') @@ -488,15 +488,15 @@ def test_instance(self): assert 
isinstance(self.jloc1, JoinLocation) assert isinstance(self.jloc2, JoinLocation) - @raises(propex.genbank.LocationError) + @raises(seqpoet.genbank.LocationError) def test_invalid_location(self): jloc = JoinLocation('join(1..200,300..400') - @raises(propex.genbank.LocationError) + @raises(seqpoet.genbank.LocationError) def test_invalid_strands(self): jloc = JoinLocation('join(complement(100),200)') - @raises(propex.genbank.LocationError) + @raises(seqpoet.genbank.LocationError) def test_messed_up_location(self): jloc = JoinLocation('complement(join(687..700,800..900,1000..1100))mRNA <687..>3158') diff --git a/propex/tests/test_search.py b/seqpoet/tests/test_search.py similarity index 94% rename from propex/tests/test_search.py rename to seqpoet/tests/test_search.py index a985c9b..e646d1c 100644 --- a/propex/tests/test_search.py +++ b/seqpoet/tests/test_search.py @@ -4,10 +4,10 @@ from nose.tools import raises from nose.plugins.skip import SkipTest -from propex.search import search, hamming_distance -from propex import Sequence -from propex import GenBank -from propex.genbank import Location +from seqpoet.search import search, hamming_distance +from seqpoet import Sequence +from seqpoet import GenBank +from seqpoet.genbank import Location class TestHammingDistance: diff --git a/propex/tests/test_sequence.py b/seqpoet/tests/test_sequence.py similarity index 72% rename from propex/tests/test_sequence.py rename to seqpoet/tests/test_sequence.py index 3fcbed7..33e38fe 100644 --- a/propex/tests/test_sequence.py +++ b/seqpoet/tests/test_sequence.py @@ -3,7 +3,7 @@ from nose.tools import raises -import propex +import seqpoet class TestSequence: @@ -12,42 +12,42 @@ def setup(self): self.illegal = 'agagcatgcacthisisnotcorrect' def test_sequence_length(self): - s = propex.Sequence(self.seq1) + s = seqpoet.Sequence(self.seq1) assert len(s) == len(self.seq1) def test_casing(self): - s = propex.Sequence(self.seq1) + s = seqpoet.Sequence(self.seq1) assert re.match('^[acgt]+$', 
str(s)) def test_reverse_complement(self): - s = propex.Sequence(self.seq1) - s2 = propex.Sequence('acct') + s = seqpoet.Sequence(self.seq1) + s2 = seqpoet.Sequence('acct') assert s.revcomp() == 'tatgtgtctctattctgtgtatgt', \ '"{0}" is not "tatgtgtctctattctgtgtatgt"'.format(s.revcomp().seq) assert s2.revcomp() == 'aggt', \ '"{0}" is not "aggt"'.format(s2.revcomp().seq) def test_str(self): - s = propex.Sequence(self.seq1) + s = seqpoet.Sequence(self.seq1) assert str(s) == self.seq1.lower() def test_repr(self): - s = propex.Sequence(self.seq1) + s = seqpoet.Sequence(self.seq1) assert repr(s) == '' assert repr(s.revcomp()) == '' def test_indexing(self): - s = propex.Sequence(self.seq1) + s = seqpoet.Sequence(self.seq1) assert s[4] == 'a' assert s[:5] == 'acata' assert s[-6:] == 'cacata' assert s[4:8] == 'acac' def test_equality(self): - s = propex.Sequence(self.seq1) + s = seqpoet.Sequence(self.seq1) assert s == self.seq1.lower() - assert s[:3] == propex.Sequence(self.seq1[:3]) + assert s[:3] == seqpoet.Sequence(self.seq1[:3]) @raises(ValueError) def test_illegal_characters(self): - s = propex.Sequence(self.illegal) + s = seqpoet.Sequence(self.illegal) diff --git a/setup.py b/setup.py index 028d288..0cc2cf5 100644 --- a/setup.py +++ b/setup.py @@ -8,12 +8,12 @@ if sys.version_info[0] < 2 or \ sys.version_info[0] == 2 and sys.version_info[1] < 7: - sys.stderr.write('Error in propex setup\n') - sys.stderr.write('You need at least version 2.7 of Python to use propex\n') + sys.stderr.write('Error in seqpoet setup\n') + sys.stderr.write('You need at least version 2.7 of Python to use seqpoet\n') sys.exit(1) if sys.version_info[0] >= 3: - sys.stderr.write('Error in propex setup\n') + sys.stderr.write('Error in seqpoet setup\n') sys.stderr.write('This package only works with Python 2 at the moment\n') sys.stderr.write('Please use Python 2.x, x >= 7\n') sys.exit(1) @@ -33,20 +33,20 @@ def find_version(*file_paths): return version_match.group(1) raise RuntimeError("Unable to 
find version string.") -setup(name='propex', - version=find_version('propex/__init__.py'), +setup(name='seqpoet', + version=find_version('seqpoet/__init__.py'), description='In silico PCR and operon extraction', - url='https://github.com/maehler/propex', + url='https://github.com/maehler/seqpoet', author='Niklas Mähler', author_email='niklas.mahler@gmail.com', maintainer='Niklas Mähler', maintainer_email='niklas.mahler@gmail.com', license='MIT', - packages=['propex'], + packages=['seqpoet'], zip_safe=False, test_suite='nose.collector', tests_require=['nose'], - scripts=['bin/propex'], + scripts=['bin/seqpoet'], classifiers=[ 'Development Status :: 3 - Alpha', 'Intended Audience :: Science/Research', From e00ea2851dfdbf8e2d731f93f02ef2f31c134432 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= Date: Sun, 19 Apr 2015 19:39:31 +0200 Subject: [PATCH 23/40] Reverse complement results, close #6 --- bin/seqpoet | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/bin/seqpoet b/bin/seqpoet index 5528e67..7e7e798 100644 --- a/bin/seqpoet +++ b/bin/seqpoet @@ -203,7 +203,7 @@ def match_primer(primers, seqs, mismatches=2, return matches -def find_operon(matches, seqs, max_distance=500): +def find_operon(matches, seqs, max_distance=500, no_revcomp=False): match_operon = [] for m in matches: gb = seqs[m['filename']] @@ -243,7 +243,9 @@ def find_operon(matches, seqs, max_distance=500): operon_seq = locus.seq[min_start:max_end] - # Reverse complement matches on minus-strand? 
+ # Reverse complement matches on minus-strand + if not no_revcomp and m['strand'] == '-': + operon_seq = operon_seq.revcomp() match_operon.append({ 'filename': m['filename'], @@ -258,7 +260,7 @@ def find_operon(matches, seqs, max_distance=500): return match_operon -def write_fasta(matches, filename=sys.stdout): +def write_fasta(matches, filename=sys.stdout, no_revcomp=False): if isinstance(filename, file): f = filename close = False @@ -267,6 +269,8 @@ def write_fasta(matches, filename=sys.stdout): close = True for m in matches: + if not no_revcomp and m['strand'] == '-': + m['seq'] = m['seq'].revcomp() m['filename'] = os.path.basename(m['filename']) s = seqpoet.fasta.FastaRecord(m['seq'], '{filename}:{seqname}:{hitstart}:{hitend}:{length}:{strand}' \ @@ -307,6 +311,10 @@ def parse_args(): 'to consider (default: %(default)d)'), type=int, default=3000, metavar='N') + parser.add_argument('--no-revcomp', help=('don\'t reverse complement ' + 'results on the minus strand (default: do reverse complementation)'), + action='store_true') + parser.add_argument('-o', '--out', help='file for output (default: stdout)', default=sys.stdout) @@ -376,12 +384,13 @@ def main(): # In silico PCR results if is_primer and args.pcr: - write_fasta(matches, filename=args.out) + write_fasta(matches, filename=args.out, no_revcomp=args.no_revcomp) exit(0) # Operon extraction print('Looking for operons', file=sys.stderr) - match_features = find_operon(matches, seqs, max_distance=args.max_distance) + match_features = find_operon(matches, seqs, max_distance=args.max_distance, + no_revcomp=args.no_revcomp) if len(match_features) == 0: print('WARNING: no operons found', file=sys.stderr) From d7bd6efb86e81620ef211d05b7df096d6c84de69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= Date: Tue, 21 Apr 2015 19:37:00 +0200 Subject: [PATCH 24/40] Fix minimal feature bug --- seqpoet/genbank.py | 2 ++ seqpoet/tests/test_genbank.py | 7 +++++++ 2 files changed, 9 insertions(+) diff --git 
a/seqpoet/genbank.py b/seqpoet/genbank.py index 9f17a3b..7c867e5 100644 --- a/seqpoet/genbank.py +++ b/seqpoet/genbank.py @@ -339,6 +339,8 @@ def from_string(cls, locus, feature_string): """ lines = [x.strip() for x in feature_string.splitlines()] ftype, location = lines[0].strip().split() + if len(lines) == 1: + return cls(locus, ftype, parse_location(location), {}) # Multiline location string i = 1 line = lines[i] diff --git a/seqpoet/tests/test_genbank.py b/seqpoet/tests/test_genbank.py index d9f6593..6f8a46c 100644 --- a/seqpoet/tests/test_genbank.py +++ b/seqpoet/tests/test_genbank.py @@ -282,6 +282,13 @@ def test_empty_qualifiers(self): assert gbf.get_qualifier('note') is None assert gbf.get_qualifier('random') == '' + def test_minimal_feature(self): + feature = ' CDS complement(52625..53704)' + gbf = seqpoet.GenBankFeature.from_string('testlocus', feature) + + assert gbf.feature_type == 'CDS' + assert str(gbf.location) == 'complement(52625..53704)' + @raises(KeyError) def test_missing_qualifier(self): feature = ''' CDS complement(52625..53704) From db82ca0d1db4591e22da117fdd9c25cf3138c4c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= Date: Tue, 21 Apr 2015 20:50:15 +0200 Subject: [PATCH 25/40] Forgot to update name in docs --- docs/Makefile | 8 ++++---- docs/conf.py | 18 +++++++++--------- docs/index.rst | 20 ++++++++++---------- docs/installation.rst | 6 +++--- docs/make.bat | 4 ++-- docs/{propex.rst => seqpoet.rst} | 30 +++++++++++++++--------------- 6 files changed, 43 insertions(+), 43 deletions(-) rename docs/{propex.rst => seqpoet.rst} (57%) diff --git a/docs/Makefile b/docs/Makefile index 48244a5..bdae658 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -87,9 +87,9 @@ qthelp: @echo @echo "Build finished; now you can run "qcollectiongenerator" with the" \ ".qhcp project file in $(BUILDDIR)/qthelp, like this:" - @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/propex.qhcp" + @echo "# qcollectiongenerator 
$(BUILDDIR)/qthelp/seqpoet.qhcp" @echo "To view the help file:" - @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/propex.qhc" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/seqpoet.qhc" applehelp: $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp @@ -104,8 +104,8 @@ devhelp: @echo @echo "Build finished." @echo "To view the help file:" - @echo "# mkdir -p $$HOME/.local/share/devhelp/propex" - @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/propex" + @echo "# mkdir -p $$HOME/.local/share/devhelp/seqpoet" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/seqpoet" @echo "# devhelp" epub: diff --git a/docs/conf.py b/docs/conf.py index f4e4ba7..a13fad9 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # -# propex documentation build configuration file, created by -# sphinx-quickstart on Sat Mar 14 20:54:34 2015. +# seqpoet documentation build configuration file, created by +# sphinx-quickstart on Tue Apr 21 20:33:27 2015. # # This file is execfile()d with the current directory set to its # containing dir. @@ -48,7 +48,7 @@ master_doc = 'index' # General information about the project. -project = u'propex' +project = u'seqpoet' copyright = u'2015, Niklas Mähler' author = u'Niklas Mähler' @@ -59,7 +59,7 @@ # The short X.Y version. import pkg_resources try: - release = pkg_resources.get_distribution('propex').version + release = pkg_resources.get_distribution('seqpoet').version except pkg_resources.DistributionNotFound: print 'To build the documentation, The distribution information of sandman' print 'Has to be available. Either install the package into your' @@ -211,7 +211,7 @@ #html_search_scorer = 'scorer.js' # Output file base name for HTML help builder. 
-htmlhelp_basename = 'propexdoc' +htmlhelp_basename = 'seqpoetdoc' # -- Options for LaTeX output --------------------------------------------- @@ -233,7 +233,7 @@ # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - (master_doc, 'propex.tex', u'propex Documentation', + (master_doc, 'seqpoet.tex', u'seqpoet Documentation', u'Author', 'manual'), ] @@ -263,7 +263,7 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - (master_doc, 'propex', u'propex Documentation', + (master_doc, 'seqpoet', u'seqpoet Documentation', [author], 1) ] @@ -277,8 +277,8 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - (master_doc, 'propex', u'propex Documentation', - author, 'propex', 'One line description of project.', + (master_doc, 'seqpoet', u'seqpoet Documentation', + author, 'seqpoet', 'One line description of project.', 'Miscellaneous'), ] diff --git a/docs/index.rst b/docs/index.rst index d49a044..21216f4 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,17 +1,17 @@ -.. propex documentation master file, created by - sphinx-quickstart on Sat Mar 14 20:54:34 2015. +.. seqpoet documentation master file, created by + sphinx-quickstart on Tue Apr 21 20:33:27 2015. You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. -propex: Prokaryotic Operon Extractor -==================================== +seqpoet: *In silico* PCR and operon extraction for genomic assemblies +===================================================================== -The main purpose of propex is to provide a simple interface for `in silico` -PCR and operon extraction in prokaryotes. The secondary purpose of propex is -to be a Python package that can be used for handling sequence data in the form -of FASTA and GenBank files. 
+The main purpose of seqpoet is to provide a simple interface for `in silico` +PCR and operon extraction for genomic assemblies. The secondary purpose of +seqpoet is to be a Python package that can be used for handling sequence +data in the form of FASTA and GenBank files. -Source code is hosted on GitHub: https://github.com/maehler/propex. +Source code is hosted on GitHub: https://github.com/maehler/seqpoet. Contents: @@ -21,7 +21,7 @@ Contents: installation insilico_pcr operon_extraction - propex + seqpoet Indices and tables diff --git a/docs/installation.rst b/docs/installation.rst index e9bf325..7a78f42 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -1,11 +1,11 @@ Installation ============ -Currently, the easiest way of installing propex is to clone the GitHub +Currently, the easiest way of installing seqpoet is to clone the GitHub repository and install it manually:: - > git clone https://github.com/maehler/propex.git - > cd propex + > git clone https://github.com/maehler/seqpoet.git + > cd seqpoet > python setup.py install Eventually the package will be submitted to `PyPI `_. diff --git a/docs/make.bat b/docs/make.bat index 38b06ba..abe8a02 100644 --- a/docs/make.bat +++ b/docs/make.bat @@ -127,9 +127,9 @@ if "%1" == "qthelp" ( echo. 
echo.Build finished; now you can run "qcollectiongenerator" with the ^ .qhcp project file in %BUILDDIR%/qthelp, like this: - echo.^> qcollectiongenerator %BUILDDIR%\qthelp\propex.qhcp + echo.^> qcollectiongenerator %BUILDDIR%\qthelp\seqpoet.qhcp echo.To view the help file: - echo.^> assistant -collectionFile %BUILDDIR%\qthelp\propex.ghc + echo.^> assistant -collectionFile %BUILDDIR%\qthelp\seqpoet.ghc goto end ) diff --git a/docs/propex.rst b/docs/seqpoet.rst similarity index 57% rename from docs/propex.rst rename to docs/seqpoet.rst index a4f75e1..5ac4df9 100644 --- a/docs/propex.rst +++ b/docs/seqpoet.rst @@ -1,37 +1,37 @@ -propex package documentation -============================ +seqpoet package +=============== Submodules ---------- -propex.fasta module -------------------- +seqpoet.fasta module +-------------------- -.. automodule:: propex.fasta +.. automodule:: seqpoet.fasta :members: :undoc-members: :show-inheritance: -propex.genbank module ---------------------- +seqpoet.genbank module +---------------------- -.. automodule:: propex.genbank +.. automodule:: seqpoet.genbank :members: :undoc-members: :show-inheritance: -propex.search module --------------------- +seqpoet.search module +--------------------- -.. automodule:: propex.search +.. automodule:: seqpoet.search :members: :undoc-members: :show-inheritance: -propex.sequence module ----------------------- +seqpoet.sequence module +----------------------- -.. automodule:: propex.sequence +.. automodule:: seqpoet.sequence :members: :undoc-members: :show-inheritance: @@ -40,7 +40,7 @@ propex.sequence module Module contents --------------- -.. automodule:: propex +.. 
automodule:: seqpoet :members: :undoc-members: :show-inheritance: From bf85818eb100a45983d606157cd4f707e99f7d0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= Date: Tue, 21 Apr 2015 21:31:16 +0200 Subject: [PATCH 26/40] Fix neighbor bug Before, the search could fall over the edge and start from the beginning, resulting in an infinite loop. --- seqpoet/genbank.py | 2 ++ seqpoet/tests/test_genbank.py | 36 ++++++++++++++++++++++++++++++++++- 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/seqpoet/genbank.py b/seqpoet/genbank.py index 7c867e5..9f0a6a9 100644 --- a/seqpoet/genbank.py +++ b/seqpoet/genbank.py @@ -528,6 +528,8 @@ def _neighbor(self, feature, downstream=True): findex -= 1 else: findex += 1 + if findex >= len(self.features[ftype]) or findex < 0: + return None return self.features[ftype][findex] diff --git a/seqpoet/tests/test_genbank.py b/seqpoet/tests/test_genbank.py index 6f8a46c..5b4c8d4 100644 --- a/seqpoet/tests/test_genbank.py +++ b/seqpoet/tests/test_genbank.py @@ -1,6 +1,7 @@ from nose.tools import raises from nose.plugins.skip import SkipTest import os +import tempfile import seqpoet from seqpoet.genbank import Location, JoinLocation @@ -18,7 +19,7 @@ def test_sequence_length(self): def test_mRNA(self): assert len(self.gb[0].features['mRNA']) == 3 - def test_next_downstream(self): + def test_neighbors(self): locus = self.gb[0] gbf = locus.features['mRNA'][0] assert gbf is not None @@ -27,6 +28,39 @@ def test_next_downstream(self): assert next is not None assert str(next.location) == '<687..>3158' + # Weird issue of alternating results when selecting next + # downstream + temp = tempfile.NamedTemporaryFile(delete=False) + temp.write('''LOCUS testlocus 5758 bp DNA linear 12-APR-2015 +FEATURES Location/qualifiers + source 1..5758 + CDS 7..693 + CDS 697..3303 + CDS complement(3381..4166) + CDS complement(4167..5516) + ORIGIN +//''') + temp.close() + gbfile = temp.name + + gb = seqpoet.GenBank(gbfile) + + locus = gb[0] + 
+ gbf = locus.features_at_location(Location('4170'))[0] + assert gbf.location.is_complement + assert str(gbf.location) == 'complement(4167..5516)' + + ds = locus.next_downstream(gbf) + assert ds.location.is_complement + assert str(ds.location) == 'complement(3381..4166)' + + ds = locus.next_downstream(ds) + assert ds is None, 'should be None, found feature at {0}' \ + .format(ds.location) + + os.unlink(gbfile) + def test_header(self): header = self.gb[0].header From 3752b8fd9b01339820217a3e11f2f2977301441c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= Date: Tue, 21 Apr 2015 21:48:17 +0200 Subject: [PATCH 27/40] Fix for very short headers --- seqpoet/genbank.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/seqpoet/genbank.py b/seqpoet/genbank.py index 9f0a6a9..4650326 100644 --- a/seqpoet/genbank.py +++ b/seqpoet/genbank.py @@ -655,7 +655,10 @@ def _parse_header(self, hstring): } last_key = None - line = header_lines.next() + try: + line = header_lines.next() + except StopIteration: + return head_data while True: if line[0] != ' ': key = line[:11].strip() From e44167500a0575d9fa6a456f126a596aad4dd5c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= Date: Tue, 21 Apr 2015 21:53:24 +0200 Subject: [PATCH 28/40] Add upstream/downstream extension arguments, close #4 --- bin/seqpoet | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/bin/seqpoet b/bin/seqpoet index 7e7e798..48cc8a8 100644 --- a/bin/seqpoet +++ b/bin/seqpoet @@ -203,12 +203,20 @@ def match_primer(primers, seqs, mismatches=2, return matches -def find_operon(matches, seqs, max_distance=500, no_revcomp=False): +def find_operon(matches, seqs, max_distance=500, no_revcomp=False, + extend_downstream=0, extend_upstream=0): match_operon = [] for m in matches: gb = seqs[m['filename']] locus = gb[m['seqindex']] - location = seqpoet.genbank.Location.from_int(m['hitstart'], m['hitend']) + if m['strand'] == '+': + 
location = seqpoet.genbank.Location.from_int( + max(1, m['hitstart'] - extend_upstream), + m['hitend'] + extend_downstream) + else: + location = seqpoet.genbank.Location.from_int( + max(1, m['hitstart'] - extend_downstream), + m['hitend'] + extend_upstream) features = locus.features_at_location(location) if len(features) == 0: print('WARNING: no gene for match in locus {0}'.format(m['seqname']), @@ -315,6 +323,14 @@ def parse_args(): 'results on the minus strand (default: do reverse complementation)'), action='store_true') + parser.add_argument('--downstream', help=('extend probe/primer match ' + '%(metavar)s bases downstream for operon finding (default: ' + '%(default)s)'), metavar='N', default=0, type=int) + + parser.add_argument('--upstream', help=('extend probe/primer match ' + '%(metavar)s bases upstream for operon finding (default: ' + '%(default)s)'), metavar='N', default=0, type=int) + parser.add_argument('-o', '--out', help='file for output (default: stdout)', default=sys.stdout) @@ -338,7 +354,8 @@ def parse_args(): if not os.path.exists(os.path.dirname(args.out)): parser.error('file or directory not found: {}'.format(args.out)) - # Mismatches, distance and max/min product length should be integers >= 0 + # Mismatches, distance, max/min product length and upstream/downstream + # should be integers >= 0 if args.mismatches < 0: parser.error('mismatches must not be negative') if args.max_distance < 0: @@ -347,6 +364,10 @@ def parse_args(): parser.error('minimum product length must not be negative') if args.max_product < 0: parser.error('maximum product length must not be negative') + if args.downstream < 0: + parser.error('downstream extension must not be negative') + if args.upstream < 0: + parser.error('upstream extension must not be negative') return args @@ -390,7 +411,8 @@ def main(): # Operon extraction print('Looking for operons', file=sys.stderr) match_features = find_operon(matches, seqs, max_distance=args.max_distance, - no_revcomp=args.no_revcomp) 
+ no_revcomp=args.no_revcomp, extend_downstream=args.downstream, + extend_upstream=args.upstream) if len(match_features) == 0: print('WARNING: no operons found', file=sys.stderr) From cc07bc29fc0408bd23b6755719221354bfa23c82 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= Date: Tue, 21 Apr 2015 21:55:08 +0200 Subject: [PATCH 29/40] Ignore compiled script --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index de16f90..0125fc4 100644 --- a/.gitignore +++ b/.gitignore @@ -7,4 +7,4 @@ /.coverage /docs/_build /data -/bin/propexc +/bin/seqpoetc From d563fb0a9cd4e3d0bce0559cd1a26ba15c5f1a73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= Date: Tue, 21 Apr 2015 21:55:32 +0200 Subject: [PATCH 30/40] Add __repr__ for GenBankFeature --- seqpoet/genbank.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/seqpoet/genbank.py b/seqpoet/genbank.py index 4650326..4620062 100644 --- a/seqpoet/genbank.py +++ b/seqpoet/genbank.py @@ -391,14 +391,17 @@ def get_qualifier(self, qualifier_name): :param qualifier_name: a string representing a qualifier. :returns: the value of the qualifier. - :raises: :py:class:`KeyError` if the feature does not have a qualifier called - ``qualifier_name``. + :raises: :py:class:`KeyError` if the feature does not have a qualifier + called ``qualifier_name``. """ if qualifier_name not in self.qualifiers: raise KeyError('{0} is not a qualifier for {1}' .format(qualifier_name, self)) return self.qualifiers[qualifier_name] + def __repr__(self): + return ''.format(self.locus, self.location) + class GenBankLocus(object): """Represent a GenBank locus. 
From e52e800870026b0f6a310c20b8ee35c8c8a91a2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= Date: Thu, 23 Apr 2015 18:39:57 +0200 Subject: [PATCH 31/40] Fix operon finding bug A bug in the operon extraction could in some cases result in that features from the opposite strand being returned. --- bin/seqpoet | 6 ++- seqpoet/tests/test_script.py | 95 ++++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+), 1 deletion(-) create mode 100644 seqpoet/tests/test_script.py diff --git a/bin/seqpoet b/bin/seqpoet index 48cc8a8..2029700 100644 --- a/bin/seqpoet +++ b/bin/seqpoet @@ -217,11 +217,15 @@ def find_operon(matches, seqs, max_distance=500, no_revcomp=False, location = seqpoet.genbank.Location.from_int( max(1, m['hitstart'] - extend_downstream), m['hitend'] + extend_upstream) - features = locus.features_at_location(location) + + features = filter(lambda x: x.location.is_complement == \ + (m['strand'] == '-'), locus.features_at_location(location)) + if len(features) == 0: print('WARNING: no gene for match in locus {0}'.format(m['seqname']), file=sys.stderr) continue + operon_genes = [] for f in features: # Find upstream genes diff --git a/seqpoet/tests/test_script.py b/seqpoet/tests/test_script.py new file mode 100644 index 0000000..3147ccb --- /dev/null +++ b/seqpoet/tests/test_script.py @@ -0,0 +1,95 @@ +import imp +import os + +from nose.plugins.skip import SkipTest + +import seqpoet + +currentdir = os.path.dirname(__file__) +rootdir = os.path.dirname(os.path.dirname(currentdir)) +bindir = os.path.join(rootdir, 'bin') + +seqpoet_script = imp.load_source('seqpoet_script', + os.path.join(bindir, 'seqpoet')) + +class TestFindOperon: + + def setup(self): + gb_dir = ('/Users/niklasm/Dropbox/operon_extractor/data_genbank') + gb_fname = os.path.join(gb_dir, 'LMG718-cremoris.gb') + if not os.path.exists(gb_fname): + raise SkipTest + + gb = seqpoet.genbank.GenBank(gb_fname) + self.seqs = { + gb_fname: gb + } + self.matches = [{ + 'filename': 
('/Users/niklasm/Dropbox/operon_extractor/' + 'data_genbank/LMG718-cremoris.gb'), + 'hitend': 3360, + 'hitstart': 3311, + 'length': 50, + 'seq': seqpoet.sequence.Sequence( + 'aattttactgatagctttttaaaaaataaaaaaaattactgacagaaatt'), + 'seqindex': 61, + 'seqname': '718_Contig_156_c', + 'strand': '+' + }] + self.minus_matches = [{ + 'filename': '/Users/niklasm/Dropbox/operon_extractor/data_genbank/LMG718-cremoris.gb', + 'hitend': 3360, + 'hitstart': 3311, + 'length': 50, + 'seq': seqpoet.sequence.Sequence( + 'aatttctgtcagtaattttttttattttttaaaaagctatcagtaaaatt'), + 'seqindex': 61, + 'seqname': '718_Contig_156_c', + 'strand': '-' + }] + + def test_operon_find(self): + res = seqpoet_script.find_operon(self.matches, self.seqs, + max_distance=500, no_revcomp=False, extend_downstream=0, + extend_upstream=0) + assert len(res) == 0 + + def test_operon_find_extend_upstream(self): + res = seqpoet_script.find_operon(self.matches, self.seqs, + max_distance=500, no_revcomp=False, extend_downstream=0, + extend_upstream=10) + assert len(res) == 1, 'expected 1 result, got {0}'.format(len(res)) + assert len(res[0]['operon']) == 2 + + operon_len = len(res[0]['seq']) + assert operon_len == 3296, 'length is {0}'.format(operon_len) + + def test_operon_find_extend_downstream(self): + res = seqpoet_script.find_operon(self.matches, self.seqs, + max_distance=500, no_revcomp=False, extend_downstream=100, + extend_upstream=0) + assert len(res) == 0, 'expected no results, got {0}'.format(len(res)) + + def test_revcomp_operon_find(self): + res = seqpoet_script.find_operon(self.matches, self.seqs, + max_distance=500, no_revcomp=False, extend_downstream=0, + extend_upstream=0) + assert len(res) == 0 + + def test_revcomp_operon_find_extend_downstream(self): + res = seqpoet_script.find_operon(self.minus_matches, self.seqs, + max_distance=500, no_revcomp=False, extend_downstream=100, + extend_upstream=0) + assert len(res) == 0 + + def test_revcomp_operon_find_extend_upstream(self): + res = 
seqpoet_script.find_operon(self.minus_matches, self.seqs, + max_distance=500, no_revcomp=False, extend_downstream=0, + extend_upstream=100) + assert len(res) == 1 + assert len(res[0]['operon']) == 2 + + assert all(x.location.is_complement for x in res[0]['operon']) + + operon_len = len(res[0]['seq']) + assert operon_len == 2135, 'length is {0}'.format(operon_len) From 6201fd939967be0f2bf6ef6b4d977d2a44bca284 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= Date: Sat, 25 Apr 2015 18:47:12 +0200 Subject: [PATCH 32/40] No side scrolling in tables --- docs/_static/style.css | 9 +++++++++ docs/_templates/layout.html | 3 +++ 2 files changed, 12 insertions(+) create mode 100644 docs/_static/style.css create mode 100644 docs/_templates/layout.html diff --git a/docs/_static/style.css b/docs/_static/style.css new file mode 100644 index 0000000..52b598f --- /dev/null +++ b/docs/_static/style.css @@ -0,0 +1,9 @@ +.wy-table-responsive table td, .wy-table-responsive table th { + white-space: normal; +} + +.wy-table-responsive { + margin-bottom: 24px; + max-width: 100%; + overflow: visible; +} diff --git a/docs/_templates/layout.html b/docs/_templates/layout.html new file mode 100644 index 0000000..75c902f --- /dev/null +++ b/docs/_templates/layout.html @@ -0,0 +1,3 @@ +{# layout.html #} +{% extends "!layout.html" %} +{% set css_files = css_files + ['_static/style.css'] %} From c521bef07ba4110a716334b7a747dfe7ab4f6bff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= Date: Sat, 25 Apr 2015 18:49:09 +0200 Subject: [PATCH 33/40] Correct theme handling on readthedocs --- docs/conf.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/docs/conf.py b/docs/conf.py index a13fad9..36c517e 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -119,7 +119,16 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. 
-html_theme = 'sphinx_rtd_theme' + +# on_rtd is whether we are on readthedocs.org +import os +on_rtd = os.environ.get('READTHEDOCS', None) == 'True' + +if not on_rtd: # only import and set the theme if we're building docs locally + import sphinx_rtd_theme + html_theme = 'sphinx_rtd_theme' + html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] + # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the From 2d1f8cc62ff5862d72b0dba54a038b97ef8a8de7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= Date: Sat, 25 Apr 2015 21:30:11 +0200 Subject: [PATCH 34/40] Add command line docs In addition to the docs, I changed the meta variables for the optional arguments to be more (in my opinion) clear. --- bin/seqpoet | 14 +++++++------- docs/command_line.rst | 40 ++++++++++++++++++++++++++++++++++++++++ docs/index.rst | 1 + 3 files changed, 48 insertions(+), 7 deletions(-) create mode 100644 docs/command_line.rst diff --git a/bin/seqpoet b/bin/seqpoet index 2029700..6e41f7b 100644 --- a/bin/seqpoet +++ b/bin/seqpoet @@ -309,19 +309,19 @@ def parse_args(): parser.add_argument('-m', '--mismatches', help=('the maximum number of ' 'mismatches allowed when aligning probe/primer to the genome ' '(default: %(default)d)'), - type=int, default=2, metavar='N') + type=int, default=2, metavar='int') parser.add_argument('-d', '--max-distance', help=('the maximum intergenic ' 'distance allowed when assembling operons (default: %(default)d)'), - type=int, default=500, metavar='N') + type=int, default=500, metavar='int') parser.add_argument('--min-product', help=('minimum PCR product length ' 'to consider (default: %(default)d)'), type=int, default=0, - metavar='N') + metavar='int') parser.add_argument('--max-product', help=('maximum PCR product length ' 'to consider (default: %(default)d)'), type=int, default=3000, - metavar='N') + metavar='int') parser.add_argument('--no-revcomp', 
help=('don\'t reverse complement ' 'results on the minus strand (default: do reverse complementation)'), @@ -329,14 +329,14 @@ def parse_args(): parser.add_argument('--downstream', help=('extend probe/primer match ' '%(metavar)s bases downstream for operon finding (default: ' - '%(default)s)'), metavar='N', default=0, type=int) + '%(default)s)'), metavar='int', default=0, type=int) parser.add_argument('--upstream', help=('extend probe/primer match ' '%(metavar)s bases upstream for operon finding (default: ' - '%(default)s)'), metavar='N', default=0, type=int) + '%(default)s)'), metavar='int', default=0, type=int) parser.add_argument('-o', '--out', help='file for output (default: stdout)', - default=sys.stdout) + default=sys.stdout, metavar='file') parser.add_argument('--version', help=('print version and exit'), action='version', version='%(prog)s v{0}'.format(seqpoet.__version__)) diff --git a/docs/command_line.rst b/docs/command_line.rst new file mode 100644 index 0000000..e18f67a --- /dev/null +++ b/docs/command_line.rst @@ -0,0 +1,40 @@ +Command line arguments +====================== + +:: + + seqpoet [options] genomedir probe + +Mandatory arguments +------------------- + +============= ======================================================= +genomedir directory containing the genome files to use (FASTA or + GenBank format) or a single GenBank or FASTA file +probe file containing either a single sequence (probe) or a + pair of sequences (primer pair; one sequence per line) +============= ======================================================= + +Optional arguments +------------------ + +-h, --help show this help message and exit +--pcr only perform in silico PCR. 
Requires that the probe + file contains a primer pair (default: perform operon + extraction) +-m int, --mismatches int + the maximum number of mismatches allowed when aligning + probe/primer to the genome (default: 2) +-d int, --max-distance int + the maximum intergenic distance allowed when + assembling operons (default: 500) +--min-product int minimum PCR product length to consider (default: 0) +--max-product int maximum PCR product length to consider (default: 3000) +--no-revcomp don't reverse complement results on the minus strand + (default: do reverse complementation) +--downstream int extend probe/primer match int bases downstream for + operon finding (default: 0) +--upstream int extend probe/primer match int bases upstream for + operon finding (default: 0) +-o file, --out file file for output (default: stdout) +--version print version and exit diff --git a/docs/index.rst b/docs/index.rst index 21216f4..5b3d04a 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -19,6 +19,7 @@ Contents: :maxdepth: 4 installation + command_line insilico_pcr operon_extraction seqpoet From e076d96fd11d74c9e515881fc5e3a3d0ffdd67a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= Date: Sat, 25 Apr 2015 21:54:40 +0200 Subject: [PATCH 35/40] Add quick start page --- docs/index.rst | 1 + docs/quickstart.rst | 58 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) create mode 100644 docs/quickstart.rst diff --git a/docs/index.rst b/docs/index.rst index 5b3d04a..cf7c54a 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -19,6 +19,7 @@ Contents: :maxdepth: 4 installation + quickstart command_line insilico_pcr operon_extraction diff --git a/docs/quickstart.rst b/docs/quickstart.rst new file mode 100644 index 0000000..f03523c --- /dev/null +++ b/docs/quickstart.rst @@ -0,0 +1,58 @@ +Quick start +=========== + +Operon extraction +----------------- + +.. 
code-block:: bash + + seqpoet --out output.fa input_directory probe.txt + seqpoet --out output.fa input_directory primers.txt + seqpoet --out output.fa input.gb probe.txt + seqpoet --out output.fa input.gb primers.txt + +The file ``input.gb`` should be a valid GenBank file (possibly with +multiple loci) and ``probe.txt`` should contain either a single nucleotide +sequence (*i.e.* a probe) or two nucleotide sequences (*i.e.* a primer pair). +Instead of supplying a single file, a directory of sequence files can be used +as the first argument. + +Annotations are needed for the operon extraction, and currently GenBank +is the only supported format for this. The FASTA file ``output.fa`` will +contain the extracted sequences. If the ``--out`` argument is not supplied, +the results are written to stdout. + +*In silico* PCR +--------------- + +.. code-block:: bash + + seqpoet --pcr --out output.fa input.gb primers.txt + seqpoet --pcr --out output.fa input.fa primers.txt + + +For *in silico* PCR, only primer pairs are supported, but the sequence input +can be either FASTA or GenBank. The FASTA file ``output.fa`` will contain the +predicted PCR products. If the ``--out`` argument is not supplied, +the results are written to stdout. + +Output +------ + +The output from both operon extraction and *in silico* PCR will be a FASTA +file. 
The header line for each result sequence is a colon separated string +and will look something like this: + +:: + + >input.gb:locus:3451:3812:28:+ + +- ``input.gb``: the original file where the sequence originates from +- ``locus``: the name of the sequence in the original file, either from a FASTA + header or a GenBank locus name +- ``3451``: the position in the original sequence of the first nucleotide in + the result sequence +- ``3812``: the position in the original sequence of the last nucleotide in the + result sequence +- ``28``: the length of the original match +- ``+``: the sequence was found on the plus strand (otherwise ``-``) From 3483110370c41203dbb15bb29396b5ba83f41b47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= Date: Sat, 25 Apr 2015 22:09:32 +0200 Subject: [PATCH 36/40] Use quick start for examples --- docs/index.rst | 2 -- docs/insilico_pcr.rst | 2 -- docs/operon_extraction.rst | 2 -- 3 files changed, 6 deletions(-) delete mode 100644 docs/insilico_pcr.rst delete mode 100644 docs/operon_extraction.rst diff --git a/docs/index.rst b/docs/index.rst index cf7c54a..3b65f63 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -21,8 +21,6 @@ Contents: installation quickstart command_line - insilico_pcr - operon_extraction seqpoet diff --git a/docs/insilico_pcr.rst b/docs/insilico_pcr.rst deleted file mode 100644 index 7626b11..0000000 --- a/docs/insilico_pcr.rst +++ /dev/null @@ -1,2 +0,0 @@ -`In silico` PCR -=============== diff --git a/docs/operon_extraction.rst b/docs/operon_extraction.rst deleted file mode 100644 index ebf4371..0000000 --- a/docs/operon_extraction.rst +++ /dev/null @@ -1,2 +0,0 @@ -Operon extraction -================= From eb2b8b159076c7d88e4aca5cbbbfbac59a8da3c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= Date: Sat, 25 Apr 2015 22:13:39 +0200 Subject: [PATCH 37/40] Add requirements --- docs/installation.rst | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git 
a/docs/installation.rst b/docs/installation.rst index 7a78f42..17d59b6 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -1,5 +1,7 @@ -Installation -============ +Installation and requirements +============================= + +The only requirement is Python 2.7. Unfortunately Python 3 is not supported. Currently, the easiest way of installing seqpoet is to clone the GitHub repository and install it manually:: @@ -8,4 +10,5 @@ repository and install it manually:: > cd seqpoet > python setup.py install -Eventually the package will be submitted to `PyPI `_. +Eventually the package will be submitted to +`PyPI `_. From 16c92a20dbb206cbf17f2f053c31d9b55edb32bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= Date: Sun, 26 Apr 2015 10:20:21 +0200 Subject: [PATCH 38/40] Doc formatting --- seqpoet/fasta.py | 14 +++++++++----- seqpoet/genbank.py | 34 +++++++++++++++++++--------------- seqpoet/sequence.py | 4 ++++ 3 files changed, 32 insertions(+), 20 deletions(-) diff --git a/seqpoet/fasta.py b/seqpoet/fasta.py index eb0d50e..500d9fa 100644 --- a/seqpoet/fasta.py +++ b/seqpoet/fasta.py @@ -1,5 +1,9 @@ +#-*- encoding: utf-8 -*- """The FASTA module contains classes for accessing FASTA files and FASTA index files. + +.. module:: fasta +.. moduleauthor:: Niklas Mähler """ import collections @@ -34,11 +38,11 @@ def parse_index(self): an OrderedDict with sequence names (headers) as keys and dicts as values. 
The value dicts have the following members: - name: the sequence name (FASTA header line) - length: sequence length - offset: the byte offset of the first base of the sequence - nbase: number of bases per line of sequence - linelen: number of bytes per line of sequence + - name: the sequence name (FASTA header line) + - length: sequence length + - offset: the byte offset of the first base of the sequence + - nbase: number of bases per line of sequence + - linelen: number of bytes per line of sequence :raises: ValueError if the file cannot be parsed, if the file contains duplicated headers or if the file is empty. diff --git a/seqpoet/genbank.py b/seqpoet/genbank.py index 4620062..d3108b1 100644 --- a/seqpoet/genbank.py +++ b/seqpoet/genbank.py @@ -154,11 +154,12 @@ class Location(object): of the bases between two positions. **Class attributes:** - * **locstring:** the string representation of the location. - * **loctype:** the type of the location. - * **start:** the start position (0-based, including). - * **end:** the end position (0-based, including). - * **is_complement:** boolean indicating whether the position represents + + - **locstring:** the string representation of the location. + - **loctype:** the type of the location. + - **start:** the start position (0-based, including). + - **end:** the end position (0-based, including). + - **is_complement:** boolean indicating whether the position represents the complement of the sequence. :param locstring: a GenBank location string. @@ -293,10 +294,11 @@ class GenBankFeature(object): """Represent a GenBank feature. **Class attributes:** - * **feature_type**: a string with the feature key. - * **location**: a Location object representing the location of - the feature. - * **qualifiers**: a dictionary of qualifiers of the feature. + + - **feature_type**: a string with the feature key. + - **location**: a Location object representing the location of + the feature. 
+ - **qualifiers**: a dictionary of qualifiers of the feature. :param locus: the name of the locus that the feature belongs to. :param feature_type: name of the feature. @@ -407,9 +409,10 @@ class GenBankLocus(object): """Represent a GenBank locus. **Class attributes:** - * **name:** locus name. - * **seq:** a Sequence object with the sequence of the locus. - * **features:** a dictionary containing the features of the locus. + + - **name:** locus name. + - **seq:** a Sequence object with the sequence of the locus. + - **features:** a dictionary containing the features of the locus. :param name: the name of the locus. :param seq: a Sequence object representing the sequence of the locus. @@ -543,9 +546,10 @@ class GenBank(object): """Represent a GenBank file. - Class attributes: - * filename: the filename of the GenBank file. - * index: a list of dictionaries representing an index of the file. + **Class attributes:** + + - filename: the filename of the GenBank file. + - index: a list of dictionaries representing an index of the file. :param fname: filename of the GenBank file. :raises: :py:exc:`.ParsingError` if parsing fails. diff --git a/seqpoet/sequence.py b/seqpoet/sequence.py index ad43b85..fc62217 100644 --- a/seqpoet/sequence.py +++ b/seqpoet/sequence.py @@ -1,4 +1,8 @@ +#-*- encoding: utf-8 -*- """Classes and functions for representing DNA sequences. + +.. module:: sequence +.. moduleauthor:: Niklas Mähler """ import re From 7e7d36f9337fc662de293f66fc6b0fdad6018b93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= Date: Sun, 26 Apr 2015 10:38:10 +0200 Subject: [PATCH 39/40] Add link to quickstart, close #8 --- docs/index.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/index.rst b/docs/index.rst index 3b65f63..ba90982 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -11,6 +11,8 @@ PCR and operon extraction for genomic assemblies. 
The secondary purpose of seqpoet is to be a Python package that can be used for handling sequence data in the form of FASTA and GenBank files. +To get started quickly, take a look at the guide in `Quick start <quickstart.html>`_. + Source code is hosted on GitHub: https://github.com/maehler/seqpoet. Contents: From 78ee5416df9e481629748f2379b05f926409c4a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Ma=CC=88hler?= Date: Sun, 26 Apr 2015 10:40:59 +0200 Subject: [PATCH 40/40] Version bump --- seqpoet/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/seqpoet/__init__.py b/seqpoet/__init__.py index 7b4ce77..1ead78f 100644 --- a/seqpoet/__init__.py +++ b/seqpoet/__init__.py @@ -3,4 +3,4 @@ from sequence import Sequence import search -__version__ = '0.2.0' +__version__ = '0.3.0'