From cf971b4834ac88b660dc801be7ca22b39fd07761 Mon Sep 17 00:00:00 2001 From: Nathan Dunn Date: Thu, 11 Jun 2020 08:08:37 -0700 Subject: [PATCH] more changes --- apollo/annotations/__init__.py | 18 ++-- test-data/gene-top.gff | 16 ++-- test/annotations_test.py | 59 +++++++++++- test/io_test.py | 160 ++++++++++++++++++++------------- 4 files changed, 176 insertions(+), 77 deletions(-) diff --git a/apollo/annotations/__init__.py b/apollo/annotations/__init__.py index a8346812..22fdb5d4 100644 --- a/apollo/annotations/__init__.py +++ b/apollo/annotations/__init__.py @@ -1261,17 +1261,23 @@ def _get_subfeature_type(self, rec): def _process_gff_entry(self, rec, new_feature_list, new_transcript_list, source=None, disable_cds_recalculation=False, use_name=False, verbose=False): type = self._get_type(rec) + print("type " + str(type)) subfeatures = self._get_subfeatures(rec) if type in util.gene_types: + print("is gene type") if subfeatures is not None and len(subfeatures) > 0: - feature_data = util._yieldApolloData(rec.features[1:], use_name=use_name, + print("has sub features") + feature_data = util._yieldApolloData(subfeatures, use_name=use_name, disable_cds_recalculation=disable_cds_recalculation) + print("output feature data" + str(feature_data)) new_transcript_list.append(feature_data) else: + print("NO sub features, just adding directly") feature_data = util._yieldApolloData(rec.features, use_name=use_name, disable_cds_recalculation=disable_cds_recalculation) + print("output feature data" + str(feature_data)) new_feature_list.append(feature_data) - if type in util.pseudogenes_types: + elif type in util.pseudogenes_types: if subfeatures is not None and len(subfeatures) > 0: feature_data = util._yieldApolloData(rec.features[1:], use_name=use_name, disable_cds_recalculation=disable_cds_recalculation) @@ -1280,11 +1286,11 @@ def _process_gff_entry(self, rec, new_feature_list, new_transcript_list, source= feature_data = util._yieldApolloData(rec.features, 
use_name=use_name, disable_cds_recalculation=disable_cds_recalculation) new_feature_list.append(feature_data) - if type in util.coding_transcript_types or type in util.noncoding_transcript_types: + elif type in util.coding_transcript_types or type in util.noncoding_transcript_types: feature_data = util._yieldApolloData(rec.features, use_name=use_name, disable_cds_recalculation=disable_cds_recalculation) new_transcript_list.append(feature_data) - if type in util.single_level_feature_types: + elif type in util.single_level_feature_types: feature_data = util._yieldApolloData(rec.features, use_name=use_name, disable_cds_recalculation=disable_cds_recalculation) new_feature_list.append(feature_data) @@ -1349,7 +1355,7 @@ def _process_gff_entry(self, rec, new_feature_list, new_transcript_list, source= # # a gene or a transcript # return_object = {} - return_object['features'] = feature_data + return_object['features'] = [feature_data] return return_object def load_gff3(self, organism, gff3, source=None, batch_size=1, @@ -1498,6 +1504,8 @@ def load_gff3(self, organism, gff3, source=None, batch_size=1, # sys.stdout.flush() sys.stdout.flush() + print("features to write" + str(new_features_list)) + print("transcripts to write" + str(new_transcripts_list)) self._write_features(new_features_list, test, verbose, timing, FeatureType.FEATURE) self._write_features(new_transcripts_list, test, verbose, timing, FeatureType.TRANSCRIPT) sys.stdout.write("\nfinished loading\n") diff --git a/test-data/gene-top.gff b/test-data/gene-top.gff index 44a8a50c..e991df12 100644 --- a/test-data/gene-top.gff +++ b/test-data/gene-top.gff @@ -1,6 +1,10 @@ -##gff-version 3 -##sequence-region Merlin 1 172788 -Merlin GeneMark.hmm gene 2 691 -856.563659 + . ID=Merlin_1;seqid=Merlin -Merlin GeneMark.hmm mRNA 2 691 . + . ID=Merlin_1_mRNA;Parent=Merlin_1;seqid=Merlin;color=#00ff00 -Merlin GeneMark.hmm exon 2 691 . + . ID=Merlin_1_exon;Parent=Merlin_1_mRNA;seqid=Merlin -Merlin GeneMark.hmm CDS 2 691 . 
+ 0 ID=Merlin_1_CDS;Parent=Merlin_1_exon;seqid=Merlin +##gff-version 3 +##sequence-region Merlin 1 172788 +ctg123 example gene 1050 9000 . + . ID=EDEN;Name=EDEN;Note=protein kinase +ctg123 example mRNA 1050 9000 . + . ID=EDEN.1;Parent=EDEN;Name=EDEN.1;Index=1 +ctg123 example five_prime_UTR 1050 1200 . + . Parent=EDEN.1 +ctg123 example CDS 1201 1500 . + 0 Parent=EDEN.1 +ctg123 example CDS 3000 3902 . + 0 Parent=EDEN.1 +ctg123 example CDS 5000 5500 . + 0 Parent=EDEN.1 +ctg123 example CDS 7000 7608 . + 0 Parent=EDEN.1 +ctg123 example three_prime_UTR 7609 9000 . + . Parent=EDEN.1 diff --git a/test/annotations_test.py b/test/annotations_test.py index 8193d325..e1c7aa62 100644 --- a/test/annotations_test.py +++ b/test/annotations_test.py @@ -1,12 +1,23 @@ from BCBio import GFF +from BCBio.GFF import GFFExaminer +# from gffutils import inspect + from . import ApolloTestCase, wa from apollo import util +def parse(path): + with open(path) as in_handle: + for rec in GFF.parse(in_handle): + yield rec + class AnnotationsTest(ApolloTestCase): - def test_features_to_apollo_schema(self): + def test_inclusion(self): + assert ("gene" in util.gene_types) + + def test_features_to_apollo_schema_mrna(self): path = 'test-data/mrna-top.gff' with open(path) as file: print(file.read()) @@ -18,7 +29,51 @@ def test_features_to_apollo_schema(self): feature_data = util._yieldApolloData(rec.features) in_handle.close() - # print(str(len(feature_data))) + print(str(feature_data)) + assert (feature_data['location'] is not None) + assert (len(feature_data['children']) == 2) + + def test_features_to_apollo_schema_gene(self): + path = 'test-data/gene-top.gff' + print("inspecting") + output = parse(path) + print(str(output)) + for o in output: + print("AAA") + print(str(o)) + print("BBB") + print("inspected") + + with open(path) as file: + print(file.read()) + file.close() + in_handle = open(path) + feature_data = None + examiner = GFFExaminer() + print(examiner.parent_child_map(in_handle)) 
in_handle.close() + in_handle = open(path) + new_feature_list = [] + new_transcript_list = [] + for rec in GFF.parse(in_handle): + print(str(rec)) + for f in rec.features: + print("feature ===== start") + print(f) + print("feature ===== end") + feature_data = wa.annotations._process_gff_entry(rec, new_feature_list=new_feature_list, + new_transcript_list=new_transcript_list) + print("feature list " + str(new_feature_list)) + print("transcript list " + str(new_transcript_list)) + print("feature data" + str(feature_data)) + # assert (subfeatures is not None and len(subfeatures) > 0) + # # feature_data = util.features_to_apollo_schema(rec.features, feature_list, transcript_list) + # feature_data = util._yieldApolloData(rec.features) + + in_handle.close() + print(str(feature_data)) + print("final feature list " + str(new_feature_list)) + print("final transcript list " + str(new_transcript_list)) assert (feature_data['location'] is not None) assert (len(feature_data['children']) == 2) diff --git a/test/io_test.py b/test/io_test.py index 581aeb11..e85a7f76 100644 --- a/test/io_test.py +++ b/test/io_test.py @@ -1,6 +1,7 @@ import re from . import ApolloTestCase, wa +import time class IoTest(ApolloTestCase): @@ -24,67 +25,98 @@ def test_export_gff3(self): assert 'Merlin\t.\tnon_canonical_three_prime_splice_site\t4297\t4297\t.\t-\t.' in gff_content assert 'Merlin\t.\tnon_canonical_five_prime_splice_site\t4364\t4364\t.\t-\t.' in gff_content - def test_export_vcf(self): - - org = wa.organisms.show_organism('test_organism') - - uuid_vcf = wa.io.write_downloadable(org['commonName'], 'VCF') - if 'error' in uuid_vcf or 'uuid' not in uuid_vcf: - raise Exception("Apollo failed to prepare the VCF file for download: %s" % uuid_vcf) - - vcf_content = wa.io.download(uuid_vcf['uuid'], output_format="text") - assert '##fileformat=VCFv4.2' in vcf_content - assert '##fileDate=20200608' in vcf_content - assert '##source=.' 
in vcf_content - assert '#CHROM POS ID REF ALT QUAL FILTER INFO' in vcf_content - - def test_export_fa_cds(self): - - org = wa.organisms.show_organism('test_organism') - - uuid_fa = wa.io.write_downloadable(org['commonName'], 'FASTA', seq_type='cds') - if 'error' in uuid_fa or 'uuid' not in uuid_fa: - raise Exception("Apollo failed to prepare the cds FASTA file for download: %s" % uuid_fa) - - fa_content = wa.io.download(uuid_fa['uuid'], output_format="text") - assert 'CGTTTAGACAAAGGTACATTATTGTATCGTGGCCAAAAATTAGACCTTCCTACATTCGAG' in fa_content - assert 'CACCTCAATTATCACTGCCGGTACTCAACAGCTGGTAAGAAAGTCTGGTGTATCGAAATA' in fa_content - assert 'ATGAGCATTAAAGTCAGAGAATTAGATGATAAGACTGATGCTTTAATTAGCGGAGTTAAA' in fa_content - assert 'ATGAAAAGCGAAAACATGTCCACAATGAGACGTCGTAAAGTTATCGCTGATTCAAAGGGT' in fa_content - assert '(mRNA) 690 residues [Merlin:2-691 + strand] [cds]' in fa_content - assert '(mRNA) 108 residues [Merlin:1067-2011 - strand] [cds]' in fa_content - assert '(mRNA) 1662 residues [Merlin:3066-4796 - strand] [cds]' in fa_content - - def test_export_fa_cdna(self): - - org = wa.organisms.show_organism('test_organism') - - uuid_fa = wa.io.write_downloadable(org['commonName'], 'FASTA', seq_type='cdna') - if 'error' in uuid_fa or 'uuid' not in uuid_fa: - raise Exception("Apollo failed to prepare the cdna FASTA file for download: %s" % uuid_fa) - - fa_content = wa.io.download(uuid_fa['uuid'], output_format="text") - assert 'CGTTTAGACAAAGGTACATTATTGTATCGTGGCCAAAAATTAGACCTTCCTACATTCGAG' in fa_content - assert 'ATGAAATCAATTTTTCGTATCAACGGTGTAGAAATTGTAGTTGAAGATGTAGTTCCTATG' in fa_content - assert 'ATGCTAACTTTAGATGAATTTAAAAACCAAGCGGGTAATATAGACTTTCAGCGTACTAAT' in fa_content - assert 'ATGAGCATTAAAGTCAGAGAATTAGATGATAAGACTGATGCTTTAATTAGCGGAGTTAAA' in fa_content - assert '(mRNA) 690 residues [Merlin:2-691 + strand] [cdna]' in fa_content - assert '(mRNA) 945 residues [Merlin:1067-2011 - strand] [cdna]' in fa_content - assert '(mRNA) 1662 residues [Merlin:3066-4796 - strand] [cdna]' in 
fa_content - - def test_export_fa_peptide(self): - - org = wa.organisms.show_organism('test_organism') - - uuid_fa = wa.io.write_downloadable(org['commonName'], 'FASTA', seq_type='peptide') - if 'error' in uuid_fa or 'uuid' not in uuid_fa: - raise Exception("Apollo failed to prepare the peptide FASTA file for download: %s" % uuid_fa) - - fa_content = wa.io.download(uuid_fa['uuid'], output_format="text") - assert 'RLDKGTLLYRGQKLDLPTFEHNAENKLFYFRNYVSTSLKPLIFGEFGRMFMALDDDTTIY' in fa_content - assert 'HLNYHCRYSTAGKKVWCIEISYWSNEQSCCSVFIR' in fa_content - assert 'MSIKVRELDDKTDALISGVKTSAGQSSQSAKIKSTITAQYPSERSAGNDTSGSLRVHDLY' in fa_content - assert 'MKSENMSTMRRRKVIADSKGERDAASTASDQVDSLELIGLKLDDVQSANELVAEVIEEKG' in fa_content - assert '(mRNA) 229 residues [Merlin:2-691 + strand] [peptide]' in fa_content - assert '(mRNA) 35 residues [Merlin:1067-2011 - strand] [peptide]' in fa_content - assert '(mRNA) 553 residues [Merlin:3066-4796 - strand] [peptide]' in fa_content + # def test_export_vcf(self): + # + # org = wa.organisms.show_organism('test_organism') + # + # uuid_vcf = wa.io.write_downloadable(org['commonName'], 'VCF') + # if 'error' in uuid_vcf or 'uuid' not in uuid_vcf: + # raise Exception("Apollo failed to prepare the VCF file for download: %s" % uuid_vcf) + # + # vcf_content = wa.io.download(uuid_vcf['uuid'], output_format="text") + # assert '##fileformat=VCFv4.2' in vcf_content + # assert '##fileDate=20200608' in vcf_content + # assert '##source=.' 
in vcf_content + # assert '#CHROM POS ID REF ALT QUAL FILTER INFO' in vcf_content + # + # def test_export_fa_cds(self): + # + # org = wa.organisms.show_organism('test_organism') + # + # uuid_fa = wa.io.write_downloadable(org['commonName'], 'FASTA', seq_type='cds') + # if 'error' in uuid_fa or 'uuid' not in uuid_fa: + # raise Exception("Apollo failed to prepare the cds FASTA file for download: %s" % uuid_fa) + # + # fa_content = wa.io.download(uuid_fa['uuid'], output_format="text") + # assert 'CGTTTAGACAAAGGTACATTATTGTATCGTGGCCAAAAATTAGACCTTCCTACATTCGAG' in fa_content + # assert 'CACCTCAATTATCACTGCCGGTACTCAACAGCTGGTAAGAAAGTCTGGTGTATCGAAATA' in fa_content + # assert 'ATGAGCATTAAAGTCAGAGAATTAGATGATAAGACTGATGCTTTAATTAGCGGAGTTAAA' in fa_content + # assert 'ATGAAAAGCGAAAACATGTCCACAATGAGACGTCGTAAAGTTATCGCTGATTCAAAGGGT' in fa_content + # assert '(mRNA) 690 residues [Merlin:2-691 + strand] [cds]' in fa_content + # assert '(mRNA) 108 residues [Merlin:1067-2011 - strand] [cds]' in fa_content + # assert '(mRNA) 1662 residues [Merlin:3066-4796 - strand] [cds]' in fa_content + # + # def test_export_fa_cdna(self): + # + # org = wa.organisms.show_organism('test_organism') + # + # uuid_fa = wa.io.write_downloadable(org['commonName'], 'FASTA', seq_type='cdna') + # if 'error' in uuid_fa or 'uuid' not in uuid_fa: + # raise Exception("Apollo failed to prepare the cdna FASTA file for download: %s" % uuid_fa) + # + # fa_content = wa.io.download(uuid_fa['uuid'], output_format="text") + # assert 'CGTTTAGACAAAGGTACATTATTGTATCGTGGCCAAAAATTAGACCTTCCTACATTCGAG' in fa_content + # assert 'ATGAAATCAATTTTTCGTATCAACGGTGTAGAAATTGTAGTTGAAGATGTAGTTCCTATG' in fa_content + # assert 'ATGCTAACTTTAGATGAATTTAAAAACCAAGCGGGTAATATAGACTTTCAGCGTACTAAT' in fa_content + # assert 'ATGAGCATTAAAGTCAGAGAATTAGATGATAAGACTGATGCTTTAATTAGCGGAGTTAAA' in fa_content + # assert '(mRNA) 690 residues [Merlin:2-691 + strand] [cdna]' in fa_content + # assert '(mRNA) 945 residues [Merlin:1067-2011 - strand] [cdna]' in fa_content + 
# assert '(mRNA) 1662 residues [Merlin:3066-4796 - strand] [cdna]' in fa_content + # + # def test_export_fa_peptide(self): + # + # org = wa.organisms.show_organism('test_organism') + # + # uuid_fa = wa.io.write_downloadable(org['commonName'], 'FASTA', seq_type='peptide') + # if 'error' in uuid_fa or 'uuid' not in uuid_fa: + # raise Exception("Apollo failed to prepare the peptide FASTA file for download: %s" % uuid_fa) + # + # fa_content = wa.io.download(uuid_fa['uuid'], output_format="text") + # assert 'RLDKGTLLYRGQKLDLPTFEHNAENKLFYFRNYVSTSLKPLIFGEFGRMFMALDDDTTIY' in fa_content + # assert 'HLNYHCRYSTAGKKVWCIEISYWSNEQSCCSVFIR' in fa_content + # assert 'MSIKVRELDDKTDALISGVKTSAGQSSQSAKIKSTITAQYPSERSAGNDTSGSLRVHDLY' in fa_content + # assert 'MKSENMSTMRRRKVIADSKGERDAASTASDQVDSLELIGLKLDDVQSANELVAEVIEEKG' in fa_content + # assert '(mRNA) 229 residues [Merlin:2-691 + strand] [peptide]' in fa_content + # assert '(mRNA) 35 residues [Merlin:1067-2011 - strand] [peptide]' in fa_content + # assert '(mRNA) 553 residues [Merlin:3066-4796 - strand] [peptide]' in fa_content + + def setUp(self): + # Make sure the organism is not already there + temp_org_info = wa.organisms.show_organism('temp_org') + if 'directory' in temp_org_info: + wa.organisms.delete_organism(temp_org_info['id']) + self.waitOrgDeleted('temp_org') + + org_info = wa.organisms.show_organism('alt_org') + if 'directory' not in org_info: + # Should not happen, but let's be tolerant... 
+ # Error received when it fails: {'error': 'No row with the given identifier exists: [org.bbop.apollo.Organism#1154]'} + time.sleep(1) + org_info = wa.organisms.show_organism('alt_org') + + wa.organisms.add_organism('temp_org', org_info['directory']) + self.waitOrgCreated('temp_org') + + def tearDown(self): + org_info = wa.organisms.show_organism('temp_org') + + if org_info and 'id' in org_info: + wa.organisms.delete_organism(org_info['id']) + + self.waitOrgDeleted('temp_org') + + org_info = wa.organisms.show_organism('some_new_org') + + if org_info and 'id' in org_info: + wa.organisms.delete_organism(org_info['id']) + self.waitOrgDeleted('some_new_org')