Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Explore dropping CDSs #62

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,10 @@ Or with the Arrow client:
History
-------

- 4.3.0
- Biopython requirements are now stricter: there is a maximum version that bcbio_gff supports
- Loading GFF3 now has a new flag, --cds_cleaning, to enable more Apollo-conformant behaviour for multi-exon transcripts
(https://github.com/galaxy-genome-annotation/python-apollo/issues/60, https://github.com/galaxy-genome-annotation/python-apollo/pull/62)
- 4.2.13
- Relax biopython requirements
- 4.2.12
Expand Down
36 changes: 22 additions & 14 deletions apollo/annotations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1247,10 +1247,15 @@ def _get_type(self, rec):
def _get_subfeature_type(self, rec):
return rec.features[0].type

def _process_gff_entry(self, rec, source=None, disable_cds_recalculation=False, use_name=False):
def _process_gff_entry(self, rec, source=None, disable_cds_recalculation=False, use_name=False, cds_cleaning=False):

new_feature_list = []
new_transcript_list = []
kwargs = {
'use_name': use_name,
'disable_cds_recalculation': disable_cds_recalculation,
'cds_cleaning': cds_cleaning,
}

type = self._get_type(rec)
log.debug("type " + str(type))
Expand All @@ -1260,39 +1265,33 @@ def _process_gff_entry(self, rec, source=None, disable_cds_recalculation=False,
if type in util.gene_types:
log.debug("is gene type")
if len(feature.sub_features) > 0:
feature_data = util.yieldApolloData(feature, use_name=use_name,
disable_cds_recalculation=disable_cds_recalculation)
feature_data = util.yieldApolloData(feature, **kwargs)
log.debug("output feature data" + str(feature_data))
if isinstance(feature_data, list):
new_transcript_list += feature_data
else:
new_transcript_list.append(feature_data)
else:
log.debug("NO sub features, just adding directly")
feature_data = util.yieldApolloData(feature, use_name=use_name,
disable_cds_recalculation=disable_cds_recalculation)
feature_data = util.yieldApolloData(feature, **kwargs)
log.debug("output feature data" + str(feature_data))
new_feature_list.append(feature_data)
elif type in util.pseudogenes_types:
feature_data = util.yieldApolloData(feature, use_name=use_name,
disable_cds_recalculation=disable_cds_recalculation)
feature_data = util.yieldApolloData(feature, **kwargs)
if isinstance(feature_data, list):
new_feature_list += feature_data
else:
new_feature_list.append(feature_data)
elif type in util.coding_transcript_types:
feature_data = util.yieldApolloData(feature, use_name=use_name,
disable_cds_recalculation=disable_cds_recalculation)
feature_data = util.yieldApolloData(feature, **kwargs)
new_transcript_list.append(feature_data)
elif type in util.noncoding_transcript_types:
log.debug("a non-coding transcript")
feature_data = util.yieldApolloData(feature, use_name=use_name,
disable_cds_recalculation=disable_cds_recalculation)
feature_data = util.yieldApolloData(feature, **kwargs)
new_feature_list.append(feature_data)
log.debug("new feature list " + str(new_feature_list))
elif type in util.single_level_feature_types:
feature_data = util.yieldApolloData(feature, use_name=use_name,
disable_cds_recalculation=disable_cds_recalculation)
feature_data = util.yieldApolloData(feature, **kwargs)
new_feature_list.append(feature_data)
else:
log.debug("unknown type " + type + " ")
Expand All @@ -1303,6 +1302,7 @@ def load_gff3(self, organism, gff3, source=None, batch_size=1,
test=False,
use_name=False,
disable_cds_recalculation=False,
cds_cleaning=False,
timing=False,
):
"""
Expand All @@ -1329,6 +1329,13 @@ def load_gff3(self, organism, gff3, source=None, batch_size=1,
:type disable_cds_recalculation: bool
:param disable_cds_recalculation: Disable CDS recalculation and instead use the one provided

:type cds_cleaning: bool
:param cds_cleaning: Change how GFF3 features are created in Apollo so
that they match more closely what Apollo expects. You will generally
want this enabled if you have transcripts with multiple exons and CDSs;
it is off by default so that existing scripts keep their current
behaviour.

:type timing: bool
:param timing: Output loading performance metrics

Expand Down Expand Up @@ -1361,7 +1368,8 @@ def load_gff3(self, organism, gff3, source=None, batch_size=1,
log.info("Processing %s with features: %s" % (rec.id, rec.features))
processed = self._process_gff_entry(rec, source=source,
disable_cds_recalculation=disable_cds_recalculation,
use_name=use_name
use_name=use_name,
cds_cleaning=cds_cleaning
)
all_processed['top-level'].extend(processed['top-level'])
all_processed['transcripts'].extend(processed['transcripts'])
Expand Down
57 changes: 46 additions & 11 deletions apollo/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ def _tnType(feature):
return 'exon'


def _yieldGeneData(gene, disable_cds_recalculation=False, use_name=False):
def _yieldGeneData(gene, disable_cds_recalculation=False, use_name=False, cds_cleaning=False):
current = _yieldSubFeatureData(gene, disable_cds_recalculation=disable_cds_recalculation, use_name=use_name)

if gene.sub_features:
Expand All @@ -122,7 +122,30 @@ def _yieldGeneData(gene, disable_cds_recalculation=False, use_name=False):
# # TODO: handle GO, Gene Product, Provenance

if 'children' in current and gene.type == 'gene':
if not cds_cleaning:
return current['children']

# Only sending mRNA level as apollo is more comfortable with orphan mRNAs
for mRNA in current['children']:
new_mRNA_children = []
new_cds = None
for feature in mRNA['children']:
if feature['type']['name'] == 'CDS':
if new_cds:
new_cds_start = new_cds['location']['fmin']
new_cds_end = new_cds['location']['fmax']
this_cds_start = feature['location']['fmin']
this_cds_end = feature['location']['fmax']
new_cds['location']['fmin'] = min(new_cds_start, this_cds_start)
new_cds['location']['fmax'] = max(new_cds_end, this_cds_end)
else:
new_cds = feature
else:
new_mRNA_children.append(feature)
if new_cds:
mRNA['children'] = new_mRNA_children
mRNA['children'].append(new_cds)

return current['children']
else:
# No children, return a generic gene feature
Expand Down Expand Up @@ -205,21 +228,27 @@ def _yieldNonCodingTranscriptData(features, disable_cds_recalculation=False, use
# return _yieldSubFeatureData(features[0])


def yieldApolloData(feature, use_name=False, disable_cds_recalculation=False):
def yieldApolloData(feature, use_name=False, disable_cds_recalculation=False, cds_cleaning=False):
# manually created a kwargs so we don't lose the actual method signature on yieldApolloData
kwargs = {
'use_name': use_name,
'disable_cds_recalculation': disable_cds_recalculation,
'cds_cleaning': cds_cleaning,
}
feature_type = _tnType(feature)
if feature_type in gene_types:
return _yieldGeneData(feature)
return _yieldGeneData(feature, **kwargs)
elif feature_type in pseudogenes_types:
return _yieldGeneData(feature)
return _yieldGeneData(feature, **kwargs)
elif feature_type in coding_transcript_types:
return _yieldCodingTranscriptData(feature)
return _yieldCodingTranscriptData(feature, **kwargs)
elif feature_type in noncoding_transcript_types:
return _yieldNonCodingTranscriptData(feature)
return _yieldNonCodingTranscriptData(feature, **kwargs)
elif feature_type in single_level_feature_types:
# return _yieldSingleLevelFeatureData(current_feature)
return _yieldSubFeatureData(feature)
return _yieldSubFeatureData(feature, **kwargs)
else:
return _yieldSubFeatureData(feature)
return _yieldSubFeatureData(feature, **kwargs)

# # if OGS:
# # TODO: handle comments
Expand Down Expand Up @@ -288,17 +317,23 @@ def add_property_to_feature(feature, property_key, property_value):
return feature


def features_to_apollo_schema(features, use_name=False, disable_cds_recalculation=False):
def features_to_apollo_schema(features, use_name=False, disable_cds_recalculation=False, cds_cleaning=False):
"""

:param disable_cds_recalculation:
:param use_name:
:param features:
:param cds_cleaning:
:return:
"""
kwargs = {
'use_name': use_name,
'disable_cds_recalculation': disable_cds_recalculation,
'cds_cleaning': cds_cleaning,
}

compiled = []
for f in features:
compiled.append(yieldApolloData(f, use_name=use_name, disable_cds_recalculation=disable_cds_recalculation))
compiled.append(yieldApolloData(f, **kwargs))
return compiled


Expand Down
2 changes: 1 addition & 1 deletion arrow/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '4.2.13'
__version__ = '4.3.0'
9 changes: 7 additions & 2 deletions arrow/commands/annotations/load_gff3.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,11 @@
help="Disable CDS recalculation and instead use the one provided",
is_flag=True
)
@click.option(
"--cds_cleaning",
help="This changes the behaviour of creating GFF3 features in apollo to match more closely to what it expects. Generally you'll probably want this on if you have transcripts with multiple exons and CDSs, but we don't want to change existing scripts so we are not defaulting this on.",
is_flag=True
)
@click.option(
"--timing",
help="Output loading performance metrics",
Expand All @@ -41,11 +46,11 @@
@pass_context
@custom_exception
@str_output
def cli(ctx, organism, gff3, source="", batch_size=1, test=False, use_name=False, disable_cds_recalculation=False, timing=False):
def cli(ctx, organism, gff3, source="", batch_size=1, test=False, use_name=False, disable_cds_recalculation=False, cds_cleaning=False, timing=False):
"""Load a full GFF3 into annotation track

Output:

Loading report
"""
return ctx.gi.annotations.load_gff3(organism, gff3, source=source, batch_size=batch_size, test=test, use_name=use_name, disable_cds_recalculation=disable_cds_recalculation, timing=timing)
return ctx.gi.annotations.load_gff3(organism, gff3, source=source, batch_size=batch_size, test=test, use_name=use_name, disable_cds_recalculation=disable_cds_recalculation, cds_cleaning=cds_cleaning, timing=timing)
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@

setup(
name="apollo",
version='4.2.13',
version='4.3.0',
description="Apollo API library",
long_description=readme,
author="Helena Rasche;Anthony Bretaudeau;Nathan Dunn",
author_email="hxr@hx42.org",
author_email="hexylena@galaxians.org",
url='https://github.com/galaxy-genome-annotation/python-apollo',
packages=['apollo', 'arrow'] + subpackages,
entry_points='''
Expand Down
Loading