From 23223ad99e8df3cff3252c3699f292469121eb54 Mon Sep 17 00:00:00 2001 From: Helena Rasche Date: Fri, 26 Jan 2024 15:24:02 +0100 Subject: [PATCH 1/8] Explore dropping CDSs --- apollo/util.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/apollo/util.py b/apollo/util.py index 802625b..11529cd 100644 --- a/apollo/util.py +++ b/apollo/util.py @@ -121,6 +121,28 @@ def _yieldGeneData(gene, disable_cds_recalculation=False, use_name=False): # # TODO: handle description # # TODO: handle GO, Gene Product, Provenance + def __floc(location): + return f"{location['fmin']}-{location['fmax']}-{location['strand']}" + + for child1 in current['children']: + exon_regions = [] + for child1 in current['children']: + for child in child1['children']: + print(child) + if child['type']['name'] == 'exon': + exon_regions.append(__floc(child['location'])) + new_current_children = [] + for child in child1['children']: + if child['type']['name'] == 'CDS': + continue + nnn = __floc(child['location']) + if nnn not in exon_regions: + new_current_children.append(child) + else: + new_current_children.append(child) + child1['children'] = new_current_children + print(exon_regions) + if 'children' in current and gene.type == 'gene': # Only sending mRNA level as apollo is more comfortable with orphan mRNAs return current['children'] From bc4bfd1f33f66ff89fa466bcc48985352f4c8f5e Mon Sep 17 00:00:00 2001 From: Helena Rasche Date: Wed, 31 Jan 2024 10:47:55 +0100 Subject: [PATCH 2/8] rearrange --- apollo/util.py | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/apollo/util.py b/apollo/util.py index 11529cd..52cf022 100644 --- a/apollo/util.py +++ b/apollo/util.py @@ -121,30 +121,27 @@ def _yieldGeneData(gene, disable_cds_recalculation=False, use_name=False): # # TODO: handle description # # TODO: handle GO, Gene Product, Provenance - def __floc(location): - return f"{location['fmin']}-{location['fmax']}-{location['strand']}" 
- - for child1 in current['children']: - exon_regions = [] + if 'children' in current and gene.type == 'gene': + # Only sending mRNA level as apollo is more comfortable with orphan mRNAs for child1 in current['children']: + exon_regions = [] + for child1 in current['children']: + for child in child1['children']: + print(child) + if child['type']['name'] == 'exon': + exon_regions.append(__floc(child['location'])) + new_current_children = [] for child in child1['children']: - print(child) - if child['type']['name'] == 'exon': - exon_regions.append(__floc(child['location'])) - new_current_children = [] - for child in child1['children']: - if child['type']['name'] == 'CDS': - continue - nnn = __floc(child['location']) - if nnn not in exon_regions: + if child['type']['name'] == 'CDS': + continue + nnn = __floc(child['location']) + if nnn not in exon_regions: + new_current_children.append(child) + else: new_current_children.append(child) - else: - new_current_children.append(child) - child1['children'] = new_current_children - print(exon_regions) + child1['children'] = new_current_children + print(exon_regions) - if 'children' in current and gene.type == 'gene': - # Only sending mRNA level as apollo is more comfortable with orphan mRNAs return current['children'] else: # No children, return a generic gene feature From da4a1f22eccb0c9743039f9ca29b81095b3eca3b Mon Sep 17 00:00:00 2001 From: Garrett Stevens Date: Wed, 31 Jan 2024 10:49:05 +0100 Subject: [PATCH 3/8] Rewrite patch for more safety --- apollo/util.py | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/apollo/util.py b/apollo/util.py index 52cf022..3a3adcb 100644 --- a/apollo/util.py +++ b/apollo/util.py @@ -123,24 +123,25 @@ def _yieldGeneData(gene, disable_cds_recalculation=False, use_name=False): if 'children' in current and gene.type == 'gene': # Only sending mRNA level as apollo is more comfortable with orphan mRNAs - for child1 in current['children']: - 
exon_regions = [] - for child1 in current['children']: - for child in child1['children']: - print(child) - if child['type']['name'] == 'exon': - exon_regions.append(__floc(child['location'])) - new_current_children = [] - for child in child1['children']: - if child['type']['name'] == 'CDS': - continue - nnn = __floc(child['location']) - if nnn not in exon_regions: - new_current_children.append(child) + for mRNA in current['children']: + new_mRNA_children = [] + new_cds = None + for feature in mRNA['children']: + if feature['type']['name'] == 'CDS': + if new_cds: + new_cds_start = new_cds['location']['fmin'] + new_cds_end = new_cds['location']['fmax'] + this_cds_start = feature['location']['fmin'] + this_cds_end = feature['location']['fmax'] + new_cds['location']['fmin'] = min(new_cds_start, this_cds_start) + new_cds['location']['fmax'] = max(new_cds_end, this_cds_end) + else: + new_cds = feature else: - new_current_children.append(child) - child1['children'] = new_current_children - print(exon_regions) + new_mRNA_children.append(feature) + if new_cds: + mRNA['children'] = new_mRNA_children + mRNA['children'].append(new_cds) return current['children'] else: From 18ee938342b6031523ede61f13f81aed7e473ae7 Mon Sep 17 00:00:00 2001 From: Helena Rasche Date: Wed, 31 Jan 2024 10:50:03 +0100 Subject: [PATCH 4/8] feature flag it --- apollo/util.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/apollo/util.py b/apollo/util.py index 3a3adcb..f1b5c5a 100644 --- a/apollo/util.py +++ b/apollo/util.py @@ -99,7 +99,7 @@ def _tnType(feature): return 'exon' -def _yieldGeneData(gene, disable_cds_recalculation=False, use_name=False): +def _yieldGeneData(gene, disable_cds_recalculation=False, use_name=False, cds_cleaning=False): current = _yieldSubFeatureData(gene, disable_cds_recalculation=disable_cds_recalculation, use_name=use_name) if gene.sub_features: @@ -122,6 +122,9 @@ def _yieldGeneData(gene, disable_cds_recalculation=False, use_name=False): # # TODO: 
handle GO, Gene Product, Provenance if 'children' in current and gene.type == 'gene': + if not cds_cleaning: + return current['children'] + # Only sending mRNA level as apollo is more comfortable with orphan mRNAs for mRNA in current['children']: new_mRNA_children = [] From fcd4ff92d24d95d222aafc19da25db7d31a31bb9 Mon Sep 17 00:00:00 2001 From: Helena Rasche Date: Wed, 31 Jan 2024 10:56:28 +0100 Subject: [PATCH 5/8] Propagate new cds_cleaning parameter --- apollo/annotations/__init__.py | 25 ++++++++++++------------- apollo/util.py | 32 ++++++++++++++++++++++---------- 2 files changed, 34 insertions(+), 23 deletions(-) diff --git a/apollo/annotations/__init__.py b/apollo/annotations/__init__.py index 10163b9..2bc6611 100644 --- a/apollo/annotations/__init__.py +++ b/apollo/annotations/__init__.py @@ -1247,10 +1247,15 @@ def _get_type(self, rec): def _get_subfeature_type(self, rec): return rec.features[0].type - def _process_gff_entry(self, rec, source=None, disable_cds_recalculation=False, use_name=False): + def _process_gff_entry(self, rec, source=None, disable_cds_recalculation=False, use_name=False, cds_cleaning=False): new_feature_list = [] new_transcript_list = [] + kwargs = { + 'use_name': use_name, + 'disable_cds_recalculation': disable_cds_recalculation, + 'cds_cleaning': cds_cleaning, + } type = self._get_type(rec) log.debug("type " + str(type)) @@ -1260,8 +1265,7 @@ def _process_gff_entry(self, rec, source=None, disable_cds_recalculation=False, if type in util.gene_types: log.debug("is gene type") if len(feature.sub_features) > 0: - feature_data = util.yieldApolloData(feature, use_name=use_name, - disable_cds_recalculation=disable_cds_recalculation) + feature_data = util.yieldApolloData(feature, **kwargs) log.debug("output feature data" + str(feature_data)) if isinstance(feature_data, list): new_transcript_list += feature_data @@ -1269,30 +1273,25 @@ def _process_gff_entry(self, rec, source=None, disable_cds_recalculation=False, 
new_transcript_list.append(feature_data) else: log.debug("NO sub features, just adding directly") - feature_data = util.yieldApolloData(feature, use_name=use_name, - disable_cds_recalculation=disable_cds_recalculation) + feature_data = util.yieldApolloData(feature, **kwargs) log.debug("output feature data" + str(feature_data)) new_feature_list.append(feature_data) elif type in util.pseudogenes_types: - feature_data = util.yieldApolloData(feature, use_name=use_name, - disable_cds_recalculation=disable_cds_recalculation) + feature_data = util.yieldApolloData(feature, **kwargs) if isinstance(feature_data, list): new_feature_list += feature_data else: new_feature_list.append(feature_data) elif type in util.coding_transcript_types: - feature_data = util.yieldApolloData(feature, use_name=use_name, - disable_cds_recalculation=disable_cds_recalculation) + feature_data = util.yieldApolloData(feature, **kwargs) new_transcript_list.append(feature_data) elif type in util.noncoding_transcript_types: log.debug("a non-coding transcript") - feature_data = util.yieldApolloData(feature, use_name=use_name, - disable_cds_recalculation=disable_cds_recalculation) + feature_data = util.yieldApolloData(feature, **kwargs) new_feature_list.append(feature_data) log.debug("new feature list " + str(new_feature_list)) elif type in util.single_level_feature_types: - feature_data = util.yieldApolloData(feature, use_name=use_name, - disable_cds_recalculation=disable_cds_recalculation) + feature_data = util.yieldApolloData(feature, **kwargs) new_feature_list.append(feature_data) else: log.debug("unknown type " + type + " ") diff --git a/apollo/util.py b/apollo/util.py index f1b5c5a..8741a96 100644 --- a/apollo/util.py +++ b/apollo/util.py @@ -228,21 +228,27 @@ def _yieldNonCodingTranscriptData(features, disable_cds_recalculation=False, use # return _yieldSubFeatureData(features[0]) -def yieldApolloData(feature, use_name=False, disable_cds_recalculation=False): +def yieldApolloData(feature, 
use_name=False, disable_cds_recalculation=False, cds_cleaning=False): + # manually created a kwargs so we don't lose the actual method signature on yieldApolloData + kwargs = { + 'use_name': use_name, + 'disable_cds_recalculation': disable_cds_recalculation, + 'cds_cleaning': cds_cleaning, + } feature_type = _tnType(feature) if feature_type in gene_types: - return _yieldGeneData(feature) + return _yieldGeneData(feature, **kwargs) elif feature_type in pseudogenes_types: - return _yieldGeneData(feature) + return _yieldGeneData(feature, **kwargs) elif feature_type in coding_transcript_types: - return _yieldCodingTranscriptData(feature) + return _yieldCodingTranscriptData(feature, **kwargs) elif feature_type in noncoding_transcript_types: - return _yieldNonCodingTranscriptData(feature) + return _yieldNonCodingTranscriptData(feature, **kwargs) elif feature_type in single_level_feature_types: # return _yieldSingleLevelFeatureData(current_feature) - return _yieldSubFeatureData(feature) + return _yieldSubFeatureData(feature, **kwargs) else: - return _yieldSubFeatureData(feature) + return _yieldSubFeatureData(feature, **kwargs) # # if OGS: # # TODO: handle comments @@ -311,17 +317,23 @@ def add_property_to_feature(feature, property_key, property_value): return feature -def features_to_apollo_schema(features, use_name=False, disable_cds_recalculation=False): +def features_to_apollo_schema(features, use_name=False, disable_cds_recalculation=False, cds_cleaning=False): """ - :param disable_cds_recalculation: :param use_name: :param features: + :param cds_cleaning: :return: """ + kwargs = { + 'use_name': use_name, + 'disable_cds_recalculation': disable_cds_recalculation, + 'cds_cleaning': cds_cleaning, + } + compiled = [] for f in features: - compiled.append(yieldApolloData(f, use_name=use_name, disable_cds_recalculation=disable_cds_recalculation)) + compiled.append(yieldApolloData(f, **kwargs)) return compiled From d6f08300545017557df7e04febc005179d9ef281 Mon Sep 17 00:00:00 
2001 From: Helena Rasche Date: Wed, 31 Jan 2024 10:58:35 +0100 Subject: [PATCH 6/8] Finish propagation --- apollo/annotations/__init__.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/apollo/annotations/__init__.py b/apollo/annotations/__init__.py index 2bc6611..d7e383b 100644 --- a/apollo/annotations/__init__.py +++ b/apollo/annotations/__init__.py @@ -1302,6 +1302,7 @@ def load_gff3(self, organism, gff3, source=None, batch_size=1, test=False, use_name=False, disable_cds_recalculation=False, + cds_cleaning=False, timing=False, ): """ @@ -1328,6 +1329,13 @@ def load_gff3(self, organism, gff3, source=None, batch_size=1, :type disable_cds_recalculation: bool :param disable_cds_recalculation: Disable CDS recalculation and instead use the one provided + :type cds_cleaning: bool + :param cds_cleaning: This changes the behaviour of creating GFF3 + features in apollo to match more closely to what it expects. Generally + you'll probably want this on if you have transcripts with multiple + exons and CDSs, but we don't want to change existing scripts + so we are not defaulting this on. 
+ :type timing: bool :param timing: Output loading performance metrics @@ -1360,7 +1368,8 @@ def load_gff3(self, organism, gff3, source=None, batch_size=1, log.info("Processing %s with features: %s" % (rec.id, rec.features)) processed = self._process_gff_entry(rec, source=source, disable_cds_recalculation=disable_cds_recalculation, - use_name=use_name + use_name=use_name, + cds_cleaning=cds_cleaning ) all_processed['top-level'].extend(processed['top-level']) all_processed['transcripts'].extend(processed['transcripts']) From c5a47522be718c9e4dbd4a16c81142c3d91ead4c Mon Sep 17 00:00:00 2001 From: Helena Rasche Date: Wed, 31 Jan 2024 11:07:40 +0100 Subject: [PATCH 7/8] manually add --- arrow/commands/annotations/load_gff3.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arrow/commands/annotations/load_gff3.py b/arrow/commands/annotations/load_gff3.py index 0559664..48f0dcc 100644 --- a/arrow/commands/annotations/load_gff3.py +++ b/arrow/commands/annotations/load_gff3.py @@ -33,6 +33,11 @@ help="Disable CDS recalculation and instead use the one provided", is_flag=True ) +@click.option( + "--cds_cleaning", + help="This changes the behaviour of creating GFF3 features in apollo to match more closely to what it expects. Generally you'll probably want this on if you have transcripts with multiple exons and CDSs, but we don't want to change existing scripts so we are not defaulting this on." 
+ is_flag=True +) @click.option( "--timing", help="Output loading performance metrics", @@ -41,11 +46,11 @@ @pass_context @custom_exception @str_output -def cli(ctx, organism, gff3, source="", batch_size=1, test=False, use_name=False, disable_cds_recalculation=False, timing=False): +def cli(ctx, organism, gff3, source="", batch_size=1, test=False, use_name=False, disable_cds_recalculation=False, cds_cleaning=False, timing=False): """Load a full GFF3 into annotation track Output: Loading report """ - return ctx.gi.annotations.load_gff3(organism, gff3, source=source, batch_size=batch_size, test=test, use_name=use_name, disable_cds_recalculation=disable_cds_recalculation, timing=timing) + return ctx.gi.annotations.load_gff3(organism, gff3, source=source, batch_size=batch_size, test=test, use_name=use_name, disable_cds_recalculation=disable_cds_recalculation, cds_cleaning=cds_cleaning, timing=timing) From 55287f083f3eda4e0550712add1b4bc5bb1e7e1b Mon Sep 17 00:00:00 2001 From: Helena Rasche Date: Wed, 31 Jan 2024 11:23:47 +0100 Subject: [PATCH 8/8] New version --- README.rst | 4 ++++ apollo/annotations/__init__.py | 2 +- arrow/__init__.py | 2 +- arrow/commands/annotations/load_gff3.py | 2 +- setup.py | 4 ++-- 5 files changed, 9 insertions(+), 5 deletions(-) diff --git a/README.rst b/README.rst index e0c28cb..bc74f9d 100644 --- a/README.rst +++ b/README.rst @@ -85,6 +85,10 @@ Or with the Arrow client: History ------- +- 4.3.0 + - Biopython requirements are now stricter; there is a maximum version that bcbio_gff supports + - loading GFF3 now has a new flag, --cds_cleaning, to enable more Apollo-conformant behaviour for multi-exon transcripts + (https://github.com/galaxy-genome-annotation/python-apollo/issues/60, https://github.com/galaxy-genome-annotation/python-apollo/pull/62) - 4.2.13 - Relax biopython requirements - 4.2.12 diff --git a/apollo/annotations/__init__.py b/apollo/annotations/__init__.py index d7e383b..4ac593e 100644 --- a/apollo/annotations/__init__.py +++ 
b/apollo/annotations/__init__.py @@ -1333,7 +1333,7 @@ def load_gff3(self, organism, gff3, source=None, batch_size=1, :param cds_cleaning: This changes the behaviour of creating GFF3 features in apollo to match more closely to what it expects. Generally you'll probably want this on if you have transcripts with multiple - exons and CDSs, but we don't want to change existing scripts + exons and CDSs, but we don't want to change existing scripts so we are not defaulting this on. :type timing: bool diff --git a/arrow/__init__.py b/arrow/__init__.py index aa33e3a..5ee6158 100644 --- a/arrow/__init__.py +++ b/arrow/__init__.py @@ -1 +1 @@ -__version__ = '4.2.13' +__version__ = '4.3.0' diff --git a/arrow/commands/annotations/load_gff3.py b/arrow/commands/annotations/load_gff3.py index 48f0dcc..949b68e 100644 --- a/arrow/commands/annotations/load_gff3.py +++ b/arrow/commands/annotations/load_gff3.py @@ -35,7 +35,7 @@ ) @click.option( "--cds_cleaning", - help="This changes the behaviour of creating GFF3 features in apollo to match more closely to what it expects. Generally you'll probably want this on if you have transcripts with multiple exons and CDSs, but we don't want to change existing scripts so we are not defaulting this on." + help="This changes the behaviour of creating GFF3 features in apollo to match more closely to what it expects. 
Generally you'll probably want this on if you have transcripts with multiple exons and CDSs, but we don't want to change existing scripts so we are not defaulting this on.", is_flag=True ) @click.option( diff --git a/setup.py b/setup.py index e4cc11f..0bea9cc 100644 --- a/setup.py +++ b/setup.py @@ -16,11 +16,11 @@ setup( name="apollo", - version='4.2.13', + version='4.3.0', description="Apollo API library", long_description=readme, author="Helena Rasche;Anthony Bretaudeau;Nathan Dunn", - author_email="hxr@hx42.org", + author_email="hexylena@galaxians.org", url='https://github.com/galaxy-genome-annotation/python-apollo', packages=['apollo', 'arrow'] + subpackages, entry_points='''