Skip to content

Commit

Permalink
Merge pull request #55 from EBIvariation/feature/support-multiple-ref…
Browse files Browse the repository at this point in the history
…-seq-per-analysis

Multiple analysis support
  • Loading branch information
apriltuesday authored Jul 15, 2021
2 parents c5d9d42 + 153c554 commit cd8a11c
Show file tree
Hide file tree
Showing 35 changed files with 1,040 additions and 475 deletions.
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -110,3 +110,11 @@ python prepare_backlog_study.py --eload 506
```

If the files are valid then ingestion can be run as usual.

### Config upgrade

There is also a script to upgrade an existing config file for a submission so that it is compatible with the current version of the submission automation scripts.
This is automatically invoked when necessary, but it can also be run on its own, e.g. when an analysis alias can't be determined automatically.
```bash
python upgrade_config.py --eload 506 --analysis_alias alias
```
1 change: 1 addition & 0 deletions bin/broker_submission.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ def main():

# Optionally Set the valid VCF and metadata file
brokering = EloadBrokering(args.eload, args.vcf_files, args.metadata_file)
brokering.upgrade_config_if_needed()
if not args.report:
brokering.broker(brokering_tasks_to_force=args.force)
brokering.report()
Expand Down
9 changes: 8 additions & 1 deletion bin/ingest_submission.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,12 @@ def main():
argparse.add_argument('--vep_cache_version', required=False, type=int,
help='VEP cache version to use for annotation. Only needed if running variant load.')
argparse.add_argument('--db_name', required=False, type=str,
help='Name of existing variant database in MongoDB. Only needed if adding a new database.')
help='Name of an existing variant database in MongoDB. Submission should have a single '
'assembly accession. Only needed if adding a new database. ex: db_name')
argparse.add_argument('--db_name_mapping', required=False, type=str, nargs='+',
help='List with the mapping for assembly accession and existing variant database in MongoDB.'
'Only needed if adding a new databases.'
'ex: GCA_000000001.1,db_name1 GCA_000000002.2,db_name2')
argparse.add_argument('--tasks', required=False, type=str, nargs='+',
default=EloadIngestion.all_tasks, choices=EloadIngestion.all_tasks,
help='Task or set of tasks to perform during ingestion.')
Expand All @@ -64,13 +69,15 @@ def main():
load_config()

ingestion = EloadIngestion(args.eload)
ingestion.upgrade_config_if_needed()
ingestion.ingest(
aggregation=args.aggregation,
instance_id=args.instance,
vep_version=args.vep_version,
vep_cache_version=args.vep_cache_version,
skip_annotation=args.skip_annotation,
db_name=args.db_name,
db_name_mapping=args.db_name_mapping,
tasks=args.tasks
)

Expand Down
2 changes: 1 addition & 1 deletion bin/prepare_submission.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from ebi_eva_common_pyutils.logger import logging_config as log_cfg

from eva_submission.submission_config import load_config
from eva_submission.eload_submission import EloadPreparation
from eva_submission.eload_preparation import EloadPreparation

logger = log_cfg.get_logger(__name__)

Expand Down
49 changes: 49 additions & 0 deletions bin/upgrade_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#!/usr/bin/env python

# Copyright 2021 EMBL - European Bioinformatics Institute
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from argparse import ArgumentParser

from ebi_eva_common_pyutils.logger import logging_config as log_cfg

from eva_submission.eload_submission import Eload
from eva_submission.submission_config import load_config

logger = log_cfg.get_logger(__name__)


def main():
argparse = ArgumentParser(description='Upgrade ELOAD config to a format compatible with current automation')
argparse.add_argument('--eload', required=True, type=int, help='The ELOAD number for this submission')
argparse.add_argument('--analysis_alias', required=False, type=str, help='Analysis alias to use')
argparse.add_argument('--debug', action='store_true', default=False,
help='Set the script to output logging information at debug level')

args = argparse.parse_args()

log_cfg.add_stdout_handler()
if args.debug:
log_cfg.set_log_level(logging.DEBUG)

# Load the config_file from default location
load_config()

eload = Eload(args.eload)
eload.upgrade_config_if_needed(args.analysis_alias)


if __name__ == "__main__":
main()
2 changes: 1 addition & 1 deletion bin/validate_submission.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,8 @@ def main():
load_config()

eload = EloadValidation(args.eload)
eload.upgrade_config_if_needed()
if not args.report:

eload.validate(args.validation_tasks, args.set_as_valid)
eload.report()

Expand Down
5 changes: 4 additions & 1 deletion eva_submission/ENA_submission/upload_to_ENA.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,10 @@ def parse_ena_receipt(self, ena_xml_receipt):
if child.tag == 'ERROR':
results['errors'].append(child.text)
for child in receipt:
if 'accession' in child.attrib:
# Store mapping from analysis accession to analysis alias.
if child.tag == 'ANALYSIS' and 'accession' in child.attrib:
results.setdefault(child.tag, {})[child.attrib['alias']] = child.attrib['accession']
elif 'accession' in child.attrib:
results[child.tag] = child.attrib['accession']
except ET.ParseError:
self.error('Cannot parse ENA receipt: ' + ena_xml_receipt)
Expand Down
1 change: 1 addition & 0 deletions eva_submission/VERSION
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
1.0.0
2 changes: 2 additions & 0 deletions eva_submission/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,5 @@

NEXTFLOW_DIR = os.path.join(os.path.dirname(os.path.abspath(eva_submission.__file__)), 'nextflow')
ETC_DIR = os.path.join(os.path.dirname(os.path.abspath(eva_submission.__file__)), 'etc')

__version__ = open(os.path.join(os.path.dirname(os.path.abspath(eva_submission.__file__)), 'VERSION')).read().strip()
70 changes: 70 additions & 0 deletions eva_submission/config_migration.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
from copy import deepcopy

from ebi_eva_common_pyutils.logger import logging_config as log_cfg

from eva_submission import __version__
from eva_submission.xlsx.xlsx_parser_eva import EvaXlsxReader


logger = log_cfg.get_logger(__name__)


def upgrade_version_0_1(eload_cfg, analysis_alias=None):
"""
Upgrades a version 0.x config to version 1, using the provided analysis alias for all files.
"""
if 'submission' not in eload_cfg:
logger.error('Need submission config section to upgrade')
logger.error('Try running prepare_submission or prepare_backlog_study to build a config from scratch.')
raise ValueError('Need submission config section to upgrade')

# Note: if we're converting an old config, there's only one analysis
if not analysis_alias:
analysis_alias = get_analysis_alias_from_metadata(eload_cfg)
analysis_data = {
'assembly_accession': eload_cfg.pop('submission', 'assembly_accession'),
'assembly_fasta': eload_cfg.pop('submission', 'assembly_fasta'),
'assembly_report': eload_cfg.pop('submission', 'assembly_report'),
'vcf_files': eload_cfg.pop('submission', 'vcf_files')
}
analysis_dict = {analysis_alias: analysis_data}
eload_cfg.set('submission', 'analyses', value=analysis_dict)

if 'validation' in eload_cfg:
eload_cfg.pop('validation', 'valid', 'vcf_files')
eload_cfg.set('validation', 'valid', 'analyses', value=analysis_dict)

if 'brokering' in eload_cfg:
brokering_vcfs = {
vcf_file: index_dict
for vcf_file, index_dict in eload_cfg.pop('brokering', 'vcf_files').items()
}
brokering_analyses = deepcopy(analysis_dict)
brokering_analyses[analysis_alias]['vcf_files'] = brokering_vcfs
eload_cfg.set('brokering', 'analyses', value=brokering_analyses)
analysis_accession = eload_cfg.pop('brokering', 'ena', 'ANALYSIS')
eload_cfg.set('brokering', 'ena', 'ANALYSIS', analysis_alias, value=analysis_accession)

# Set version once we've successfully upgraded
eload_cfg.set('version', value=__version__)


def get_analysis_alias_from_metadata(eload_cfg):
"""
Returns analysis alias only if we find a metadata spreadsheet and it has exactly one analysis.
Otherwise provides an error message and raise an error.
"""
metadata_spreadsheet = eload_cfg.query('validation', 'valid', 'metadata_spreadsheet')
if metadata_spreadsheet:
reader = EvaXlsxReader(metadata_spreadsheet)
if len(reader.analysis) == 1:
return reader.analysis[0].get('Analysis Alias')

if len(reader.analysis) > 1:
logger.error("Can't assign analysis alias: multiple analyses found in metadata!")
else:
logger.error("Can't assign analysis alias: no analyses found in metadata!")
else:
logger.error("Can't assign analysis alias: no metadata found!")
logger.error("Try running upgrade_config and passing an analysis alias explicitly.")
raise ValueError("Can't find an analysis alias for config upgrade.")
59 changes: 34 additions & 25 deletions eva_submission/eload_backlog.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,20 +57,18 @@ def get_species_info(self):
self.eload_cfg.set('submission', 'scientific_name', value=sci_name)

with self.metadata_connection_handle as conn:
query = f"select distinct b.vcf_reference_accession " \
query = f"select distinct b.analysis_accession, b.vcf_reference_accession " \
f"from project_analysis a " \
f"join analysis b on a.analysis_accession=b.analysis_accession " \
f"where a.project_accession='{self.project_accession}' and b.hidden_in_eva=0;"
rows = get_all_results_for_query(conn, query)
if len(rows) < 1:
raise ValueError(f'No reference accession for {self.project_accession} found in metadata DB.')
elif len(rows) > 1:
raise ValueError(f'Multiple reference accession for {self.project_accession} found in metadata DB.')
asm_accession, = rows[0]
self.eload_cfg.set('submission', 'assembly_accession', value=asm_accession)
fasta_path, report_path = get_reference_fasta_and_report(sci_name, asm_accession)
self.eload_cfg.set('submission', 'assembly_fasta', value=fasta_path)
self.eload_cfg.set('submission', 'assembly_report', value=report_path)
for analysis_accession, asm_accession in rows:
self.eload_cfg.set('submission', 'analyses', analysis_accession, 'assembly_accession', value=asm_accession)
fasta_path, report_path = get_reference_fasta_and_report(sci_name, asm_accession)
self.eload_cfg.set('submission', 'analyses', analysis_accession, 'assembly_fasta', value=fasta_path)
self.eload_cfg.set('submission', 'analyses', analysis_accession, 'assembly_report', value=report_path)

def find_local_file(self, fn):
full_path = os.path.join(self._get_dir('vcf'), fn)
Expand Down Expand Up @@ -99,16 +97,16 @@ def get_analysis_info(self):
f"from project_analysis a " \
f"join analysis_file b on a.analysis_accession=b.analysis_accession " \
f"join file c on b.file_id=c.file_id " \
f"where a.project_accession='{self.project_accession}' and a.hidden_in_eva=0" \
f"join analysis d on a.analysis_accession=d.analysis_accession " \
f"where a.project_accession='{self.project_accession}' and d.hidden_in_eva=0" \
f"group by a.analysis_accession;"
rows = get_all_results_for_query(conn, query)
if len(rows) == 0:
raise ValueError(f'No analyses for {self.project_accession} found in metadata DB.')
raise ValueError(f'No analysis for {self.project_accession} found in metadata DB.')

submitted_vcfs = []
for analysis_accession, filenames in rows:
# TODO for now we assume a single analysis per project as that's what the eload config supports
self.eload_cfg.set('brokering', 'ena', 'ANALYSIS', value=analysis_accession)
# Uses the analysis accession as analysis alias
self.eload_cfg.set('brokering', 'ena', 'ANALYSIS', analysis_accession, value=analysis_accession)
vcf_file_list = []
index_file_dict = {}
for fn in filenames:
Expand All @@ -128,30 +126,41 @@ def get_analysis_info(self):
basename = os.path.basename(vcf_file)
if basename not in index_file_dict:
raise ValueError(f'Index file is missing from metadata DB for vcf {basename} analysis {analysis_accession}')
submitted_vcfs.append(vcf_file)
self.eload_cfg.set('brokering', 'vcf_files', vcf_file, 'index', value=index_file_dict.pop(basename))
self.eload_cfg.set('brokering', 'analyses', analysis_accession, 'vcf_files', vcf_file, 'index',
value=index_file_dict.pop(basename))

# Check if there are any orphaned index
if len(index_file_dict) > 0:
raise ValueError(f'VCF file is missing from metadata DB for index {", ".join(index_file_dict.values())}'
f' for analysis {analysis_accession}')
self.eload_cfg.set('submission', 'vcf_files', value=submitted_vcfs)
# Using analysis_accession instead of analysis alias. This should not have any detrimental effect on
# ingestion
self.eload_cfg.set('submission', 'analyses', analysis_accession, 'vcf_files', value=vcf_file_list)

def _analysis_report(self, all_analysis):
reports = []
for analysis_accession in all_analysis:
assembly = all_analysis.get(analysis_accession).get('assembly_accession', '')
fasta = all_analysis.get(analysis_accession).get('assembly_fasta', '')
vcf_files_str = '\n'.join(all_analysis.get(analysis_accession).get('vcf_files', []))
reports.append(f"""{analysis_accession}
- Assembly: {assembly}
- Fasta file: {fasta}
- VCF file:
{vcf_files_str}""")
return '\n'.join(reports)

def report(self):
"""Collect information from the config and write the report."""
report_data = {
'project': self.eload_cfg.query('brokering', 'ena', 'PROJECT'),
'analysis': self.eload_cfg.query('brokering', 'ena', 'ANALYSIS'),
'vcf': self.eload_cfg.query('submission', 'vcf_files'),
'assembly': self.eload_cfg.query('submission', 'assembly_accession'),
'fasta': self.eload_cfg.query('submission', 'assembly_fasta')
'project': self.eload_cfg.query('brokering', 'ena', 'PROJECT', ret_default=''),
'analyses': ', '.join(self.eload_cfg.query('brokering', 'ena', 'ANALYSIS', ret_default=[])),
'analyses_report': self._analysis_report(self.eload_cfg.query('submission', 'analyses', ret_default=[]))
}

report = """Results of backlog study preparation:
Project accession: {project}
Assembly: {assembly}
Fasta file: {fasta}
Analysis accession: {analysis}
VCF file: {vcf}
Analysis accession(s): {analyses}
Analysis information: {analyses_report}
"""
print(report.format(**report_data))
Loading

0 comments on commit cd8a11c

Please sign in to comment.