diff --git a/README.md b/README.md index d7f8638..c7849cc 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,76 @@ # eva-sub-cli EVA Submission Command Line Interface for Validation + + + + +## Installation + +TBD + +## Input files for the validation and submission tool + +### The VCF file and association with reference genome + +The paths to the VCF files are provided via a CSV file that links each VCF to its respective fasta sequence. This allows +us to support different assemblies for each VCF file. +The CSV file `vcf_mapping.csv` contains the following columns vcf, fasta, report, providing respectively: + - The VCF to validate/upload + - The assembly in fasta format that was used to derive the VCF + - The assembly report associated with the assembly (if available) as found in NCBI assemblies (https://www.ncbi.nlm.nih.gov/genome/doc/ftpfaq/#files) + + +Example: +```shell +vcf,fasta,report +/full/path/to/vcf_file1.vcf,/full/path/to/genome.fa,/full/path/to/genome_assembly_report.txt +/full/path/to/vcf_file2.vcf,/full/path/to/genome.fa,/full/path/to/genome_assembly_report.txt +/full/path/to/vcf_file3.vcf,/full/path/to/genome2.fa,/full/path/to/genome_assembly_report2.txt +``` + +### The metadata spreadsheet + +The metadata template can be found within the etc folder at `eva_sub_cli/etc/EVA_Submission_template.xlsx` +It should be populated following the instructions provided within the template + +### The metadata JSON + +The metadata can also be provided via a JSON file which should conform to the schema located at +`eva_sub_cli/etc/eva_schema.json` + +More detailed documentation to follow + +## Execution + +### Validate and submit your dataset + +To validate and submit run the following command + +```shell +eva-sub-cli.py --metadata_xlsx metadata_spreadsheet.xlsx \ + --vcf_files_mapping vcf_mapping.csv --submission_dir submission_dir +``` + +### Validate only + +To validate and not submit run the following command + +```shell +eva-sub-cli.py --metadata_xlsx 
metadata_spreadsheet.xlsx \ + --vcf_files_mapping vcf_mapping.csv --submission_dir submission_dir + --tasks VALIDATE +``` +### Submit only + +All submissions must have been validated. You cannot run the submission without validation. Once validated, running + +```shell +eva-sub-cli.py --metadata_xlsx metadata_spreadsheet.xlsx \ + --vcf_files_mapping vcf_mapping.csv --submission_dir submission_dir +``` +or +```shell +eva-sub-cli.py --metadata_xlsx metadata_spreadsheet.xlsx \ + --vcf_files_mapping vcf_mapping.csv --submission_dir submission_dir --tasks SUBMIT +``` +will only submit the data and not validate. diff --git a/bin/eva-sub-cli.py b/bin/eva-sub-cli.py index 6f20326..cefd2fb 100755 --- a/bin/eva-sub-cli.py +++ b/bin/eva-sub-cli.py @@ -1,36 +1,19 @@ #!/usr/bin/env python -import csv -import os + from argparse import ArgumentParser -from ebi_eva_common_pyutils.config import WritableConfig from ebi_eva_common_pyutils.logger import logging_config -from eva_sub_cli import SUB_CLI_CONFIG_FILE, __version__ -from eva_sub_cli.docker_validator import DockerValidator, docker_path, container_image -from eva_sub_cli.reporter import READY_FOR_SUBMISSION_TO_EVA -from eva_sub_cli.submit import StudySubmitter - -VALIDATE = 'validate' -SUBMIT = 'submit' -RESUME_SUBMISSION = 'resume_submission' - -logging_config.add_stdout_handler() - - -def get_vcf_files(mapping_file): - vcf_files = [] - with open(mapping_file) as open_file: - reader = csv.DictReader(open_file, delimiter=',') - for row in reader: - vcf_files.append(row['vcf']) - return vcf_files +from eva_sub_cli import main +from eva_sub_cli.main import VALIDATE, SUBMIT if __name__ == "__main__": argparser = ArgumentParser(description='EVA Submission CLI - validate and submit data to EVA') - argparser.add_argument('--task', required=True, choices=[VALIDATE, SUBMIT, RESUME_SUBMISSION], - help='Select a task to perform') + argparser.add_argument('--tasks', nargs='*', choices=[VALIDATE, SUBMIT], default=[SUBMIT], + help='Select a 
task to perform. Selecting VALIDATE will run the validation regardless of the outcome of ' + 'previous runs. Selecting SUBMIT will run validate only if the validation was not performed ' + 'successfully before and then run the submission.') argparser.add_argument('--submission_dir', required=True, type=str, help='Full path to the directory where all processing will be done ' 'and submission info is/will be stored') @@ -41,38 +24,17 @@ def get_vcf_files(mapping_file): help="Json file that describe the project, analysis, samples and files") group.add_argument("--metadata_xlsx", help="Excel spreadsheet that describe the project, analysis, samples and files") - group.add_argument("--username", - help="Username used for connecting to the ENA webin account") - group.add_argument("--password", - help="Password used for connecting to the ENA webin account") + argparser.add_argument("--username", + help="Username used for connecting to the ENA webin account") + argparser.add_argument("--password", + help="Password used for connecting to the ENA webin account") + argparser.add_argument("--resume", default=False, action='store_true', + help="Resume the process execution from where it left of. 
This is currently only supported " + "for the upload part of the SUBMIT task.") args = argparser.parse_args() - # load config - config_file_path = os.path.join(args.submission_dir, SUB_CLI_CONFIG_FILE) - sub_config = WritableConfig(config_file_path, version=__version__) - - vcf_files = get_vcf_files(args.vcf_files_mapping) - metadata_file = args.metadata_json or args.metadata_xlsx - - if args.task == RESUME_SUBMISSION: - # if validation is not passed, process task submit (validate and submit) - if READY_FOR_SUBMISSION_TO_EVA not in sub_config or not sub_config[READY_FOR_SUBMISSION_TO_EVA]: - args.task = SUBMIT - else: - # if validation is passed, upload files without validating again - with StudySubmitter(args.submission_dir, vcf_files, metadata_file, submission_config=sub_config, - username=args.username, password=args.password) as submitter: - submitter.upload_submission() - - if args.task == VALIDATE or args.task == SUBMIT: - with DockerValidator(args.vcf_files_mapping, args.submission_dir, args.metadata_json, args.metadata_xlsx, - submission_config=sub_config) as validator: - validator.validate() - validator.create_reports() - validator.update_config_with_validation_result() + logging_config.add_stdout_handler() - if args.task == SUBMIT: - with StudySubmitter(args.submission_dir, vcf_files, metadata_file, submission_config=sub_config, - username=args.username, password=args.password) as submitter: - submitter.submit() + main.orchestrate_process(args.submission_dir, args.vcf_files_mapping, args.metadata_json, args.metadata_xlsx, + args.tasks, args.resume) diff --git a/eva_sub_cli/auth.py b/eva_sub_cli/auth.py index ee0f35c..3f8eb9d 100644 --- a/eva_sub_cli/auth.py +++ b/eva_sub_cli/auth.py @@ -102,5 +102,5 @@ def get_auth(username=None, password=None): global auth if auth: return auth - auth = WebinAuth(username, password) + auth = WebinAuth(username=username, password=password) return auth diff --git a/eva_sub_cli/etc/EVA_Submission_template.xlsx 
b/eva_sub_cli/etc/EVA_Submission_template.xlsx new file mode 100644 index 0000000..6b2d0ef Binary files /dev/null and b/eva_sub_cli/etc/EVA_Submission_template.xlsx differ diff --git a/eva_sub_cli/main.py b/eva_sub_cli/main.py new file mode 100755 index 0000000..4f19aa0 --- /dev/null +++ b/eva_sub_cli/main.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python +import csv +import os +from ebi_eva_common_pyutils.config import WritableConfig +from ebi_eva_common_pyutils.logger import logging_config + +from eva_sub_cli import SUB_CLI_CONFIG_FILE, __version__ +from eva_sub_cli.docker_validator import DockerValidator +from eva_sub_cli.reporter import READY_FOR_SUBMISSION_TO_EVA +from eva_sub_cli.submit import StudySubmitter + +VALIDATE = 'validate' +SUBMIT = 'submit' + +def get_vcf_files(mapping_file): + vcf_files = [] + with open(mapping_file) as open_file: + reader = csv.DictReader(open_file, delimiter=',') + for row in reader: + vcf_files.append(row['vcf']) + return vcf_files + + +def orchestrate_process(submission_dir, vcf_files_mapping, metadata_json, metadata_xlsx, tasks, resume): + # load config + config_file_path = os.path.join(submission_dir, SUB_CLI_CONFIG_FILE) + sub_config = WritableConfig(config_file_path, version=__version__) + + metadata_file = metadata_json or metadata_xlsx + vcf_files = get_vcf_files(vcf_files_mapping) + + # Validation is mandatory so if submit is requested then VALIDATE must have run before or be requested as well + if SUBMIT in tasks and not sub_config.get(READY_FOR_SUBMISSION_TO_EVA, False): + if VALIDATE not in tasks: + tasks.append(VALIDATE) + + if VALIDATE in tasks: + with DockerValidator(vcf_files_mapping, submission_dir, metadata_json, metadata_xlsx, + submission_config=sub_config) as validator: + validator.validate() + validator.create_reports() + validator.update_config_with_validation_result() + if SUBMIT in tasks: + with StudySubmitter(submission_dir, vcf_files, metadata_file, submission_config=sub_config) as submitter: + 
submitter.submit(resume=resume) diff --git a/eva_sub_cli/submit.py b/eva_sub_cli/submit.py index e652253..7a9b8ad 100644 --- a/eva_sub_cli/submit.py +++ b/eva_sub_cli/submit.py @@ -41,20 +41,19 @@ def update_config_with_submission_id_and_upload_url(self, submission_id, upload_ self.sub_config.set(SUB_CLI_CONFIG_KEY_SUBMISSION_ID, value=submission_id) self.sub_config.set(SUB_CLI_CONFIG_KEY_SUBMISSION_UPLOAD_URL, value=upload_url) - def upload_submission(self, submission_upload_url=None): + def _upload_submission(self): if READY_FOR_SUBMISSION_TO_EVA not in self.sub_config or not self.sub_config[READY_FOR_SUBMISSION_TO_EVA]: raise Exception(f'There are still validation errors that needs to be addressed. ' f'Please review, address and re-validate before uploading.') - if not submission_upload_url: - submission_upload_url = self.sub_config[SUB_CLI_CONFIG_KEY_SUBMISSION_UPLOAD_URL] + submission_upload_url = self.sub_config[SUB_CLI_CONFIG_KEY_SUBMISSION_UPLOAD_URL] for f in self.vcf_files: - self.upload_file(submission_upload_url, f) - self.upload_file(submission_upload_url, self.metadata_file) + self._upload_file(submission_upload_url, f) + self._upload_file(submission_upload_url, self.metadata_file) @retry(tries=5, delay=10, backoff=5) - def upload_file(self, submission_upload_url, input_file): + def _upload_file(self, submission_upload_url, input_file): base_name = os.path.basename(input_file) self.info(f'Transfer {base_name} to EVA FTP') r = requests.put(urljoin(submission_upload_url, base_name), data=open(input_file, 'rb')) @@ -67,21 +66,20 @@ def verify_submission_dir(self, submission_dir): if not os.access(submission_dir, os.W_OK): raise Exception(f"The directory '{submission_dir}' does not have write permissions.") - def submit(self): + def submit(self, resume=False): if READY_FOR_SUBMISSION_TO_EVA not in self.sub_config or not self.sub_config[READY_FOR_SUBMISSION_TO_EVA]: raise Exception(f'There are still validation errors that need to be addressed. 
' f'Please review, address and re-validate before submitting.') - - self.verify_submission_dir(self.submission_dir) - response = requests.post(self.submission_initiate_url, - headers={'Accept': 'application/hal+json', - 'Authorization': 'Bearer ' + self.auth.token}) - response.raise_for_status() - response_json = response.json() - self.info("Submission ID {} received!!".format(response_json["submissionId"])) - - # update config with submission id and upload url - self.update_config_with_submission_id_and_upload_url(response_json["submissionId"], response_json["uploadUrl"]) + if not (resume or self.sub_config.get(SUB_CLI_CONFIG_KEY_SUBMISSION_UPLOAD_URL)): + self.verify_submission_dir(self.submission_dir) + response = requests.post(self.submission_initiate_url, + headers={'Accept': 'application/hal+json', + 'Authorization': 'Bearer ' + self.auth.token}) + response.raise_for_status() + response_json = response.json() + self.info("Submission ID {} received!!".format(response_json["submissionId"])) + # update config with submission id and upload url + self.update_config_with_submission_id_and_upload_url(response_json["submissionId"], response_json["uploadUrl"]) # upload submission - self.upload_submission(response_json["uploadUrl"]) + self._upload_submission() diff --git a/requirements.txt b/requirements.txt index 7c69970..5be23ce 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ pyyaml jinja2 -minify_html +minify_html==0.11.1 openpyxl requests jsonschema diff --git a/tests/resources/EVA_Submission_template.V1.1.4.xlsx b/tests/resources/EVA_Submission_test.xlsx similarity index 100% rename from tests/resources/EVA_Submission_template.V1.1.4.xlsx rename to tests/resources/EVA_Submission_test.xlsx diff --git a/tests/test_docker_validator.py b/tests/test_docker_validator.py index f75e63e..6863417 100644 --- a/tests/test_docker_validator.py +++ b/tests/test_docker_validator.py @@ -62,7 +62,7 @@ def setUp(self): container_name='eva-sub-cli-test' ) 
shutil.copyfile( - os.path.join(self.resources_folder, 'EVA_Submission_template.V1.1.4.xlsx'), + os.path.join(self.resources_folder, 'EVA_Submission_test.xlsx'), self.metadata_xlsx ) diff --git a/tests/test_main.py b/tests/test_main.py new file mode 100644 index 0000000..1e63dd2 --- /dev/null +++ b/tests/test_main.py @@ -0,0 +1,93 @@ +import json +import os +import shutil +import unittest +from unittest.mock import MagicMock, patch, Mock + +import yaml +from ebi_eva_common_pyutils.config import WritableConfig + +from eva_sub_cli import SUB_CLI_CONFIG_FILE +from eva_sub_cli.main import orchestrate_process, VALIDATE, SUBMIT +from eva_sub_cli.reporter import READY_FOR_SUBMISSION_TO_EVA +from eva_sub_cli.submit import StudySubmitter, SUB_CLI_CONFIG_KEY_SUBMISSION_ID, SUB_CLI_CONFIG_KEY_SUBMISSION_UPLOAD_URL + + +class TestMain(unittest.TestCase): + resource_dir = os.path.join(os.path.dirname(__file__), 'resources') + test_sub_dir = os.path.join(resource_dir, 'test_sub_dir') + config_file = os.path.join(test_sub_dir, SUB_CLI_CONFIG_FILE) + + mapping_file = os.path.join(test_sub_dir, 'vcf_files_metadata.csv') + metadata_json = os.path.join(test_sub_dir, 'sub_metadata.json') + metadata_xlsx = os.path.join(test_sub_dir, 'sub_metadata.xlsx') + + def test_orchestrate_validate(self): + with patch('eva_sub_cli.main.get_vcf_files') as m_get_vcf, \ + patch('eva_sub_cli.main.WritableConfig') as m_config, \ + patch('eva_sub_cli.main.DockerValidator') as m_docker_validator: + orchestrate_process( + self.test_sub_dir, self.mapping_file, self.metadata_json, self.metadata_xlsx, tasks=[VALIDATE], + resume=False + ) + m_get_vcf.assert_called_once_with(self.mapping_file) + m_docker_validator.assert_any_call( + self.mapping_file, self.test_sub_dir, self.metadata_json, self.metadata_xlsx, + submission_config=m_config.return_value + ) + with m_docker_validator() as validator: + validator.validate.assert_called_once_with() + validator.create_reports.assert_called_once_with() + 
validator.update_config_with_validation_result.assert_called_once_with() + + + def test_orchestrate_validate_submit(self): + with patch('eva_sub_cli.main.get_vcf_files') as m_get_vcf, \ + patch('eva_sub_cli.main.WritableConfig') as m_config, \ + patch('eva_sub_cli.main.DockerValidator') as m_docker_validator, \ + patch('eva_sub_cli.main.StudySubmitter') as m_submitter: + # Empty config + m_config.return_value = {} + + orchestrate_process( + self.test_sub_dir, self.mapping_file, self.metadata_json, self.metadata_xlsx, tasks=[SUBMIT], + resume=False + ) + m_get_vcf.assert_called_once_with(self.mapping_file) + # Validate was run because the config show it was not run successfully before + m_docker_validator.assert_any_call( + self.mapping_file, self.test_sub_dir, self.metadata_json, self.metadata_xlsx, + submission_config=m_config.return_value + ) + with m_docker_validator() as validator: + validator.validate.assert_called_once_with() + validator.create_reports.assert_called_once_with() + validator.update_config_with_validation_result.assert_called_once_with() + + # Submit was created + m_submitter.assert_any_call(self.test_sub_dir, m_get_vcf.return_value, self.metadata_json, + submission_config=m_config.return_value) + with m_submitter() as submitter: + submitter.submit.assert_called_once_with(resume=False) + + def test_orchestrate_submit_no_validate(self): + with patch('eva_sub_cli.main.get_vcf_files') as m_get_vcf, \ + patch('eva_sub_cli.main.WritableConfig') as m_config, \ + patch('eva_sub_cli.main.DockerValidator') as m_docker_validator, \ + patch('eva_sub_cli.main.StudySubmitter') as m_submitter: + # Empty config + m_config.return_value = {READY_FOR_SUBMISSION_TO_EVA: True} + + orchestrate_process( + self.test_sub_dir, self.mapping_file, self.metadata_json, self.metadata_xlsx, tasks=[SUBMIT], + resume=False + ) + m_get_vcf.assert_called_once_with(self.mapping_file) + # Validate was not run because the config showed it was run successfully before + assert 
m_docker_validator.call_count == 0 + + # Submit was created + m_submitter.assert_any_call(self.test_sub_dir, m_get_vcf.return_value, self.metadata_json, + submission_config=m_config.return_value) + with m_submitter() as submitter: + submitter.submit.assert_called_once_with(resume=False) diff --git a/tests/test_submit.py b/tests/test_submit.py index 84970ed..ab7864f 100644 --- a/tests/test_submit.py +++ b/tests/test_submit.py @@ -21,7 +21,7 @@ def setUp(self) -> None: self.token = 'a token' with patch('eva_sub_cli.submit.get_auth', return_value=Mock(token=self.token)): vcf_files = [os.path.join(self.resource_dir, 'vcf_files', 'example2.vcf.gz')] - metadata_file = os.path.join(self.resource_dir, 'EVA_Submission_template.V1.1.4.xlsx') + metadata_file = os.path.join(self.resource_dir, 'EVA_Submission_test.xlsx') self.submitter = StudySubmitter(submission_dir=self.test_sub_dir, vcf_files=vcf_files, metadata_file=metadata_file) @@ -40,7 +40,7 @@ def test_submit(self): # Set the side_effect attribute to return different responses with patch('eva_sub_cli.submit.requests.post', return_value=mock_submit_response) as mock_post, \ - patch.object(StudySubmitter, 'upload_submission'), \ + patch.object(StudySubmitter, '_upload_submission'), \ patch.object(StudySubmitter, 'verify_submission_dir'), \ patch.object(StudySubmitter, 'update_config_with_submission_id_and_upload_url'), \ patch.object(self.submitter, 'sub_config', {READY_FOR_SUBMISSION_TO_EVA: True}), \ @@ -63,7 +63,7 @@ def test_submit_with_config(self): sub_config.write() with patch('eva_sub_cli.submit.requests.post', return_value=mock_submit_response) as mock_post, \ - patch.object(StudySubmitter, 'upload_submission'): + patch.object(StudySubmitter, '_upload_submission'): with self.submitter as submitter: submitter.submit() @@ -102,10 +102,10 @@ def test_upload_submission(self): mock_submit_response = MagicMock() mock_submit_response.status_code = 200 test_url = 'http://example.com/' - with patch.object(StudySubmitter, 
'upload_file') as mock_upload_file, \ + with patch.object(StudySubmitter, '_upload_file') as mock_upload_file, \ patch.object(self.submitter, 'sub_config', {READY_FOR_SUBMISSION_TO_EVA: True}): - - self.submitter.upload_submission(submission_upload_url=test_url) + self.submitter.sub_config[SUB_CLI_CONFIG_KEY_SUBMISSION_UPLOAD_URL] = test_url + self.submitter._upload_submission() for vcf_file in self.submitter.vcf_files: mock_upload_file.assert_any_call(test_url, vcf_file) mock_upload_file.assert_called_with(test_url, self.submitter.metadata_file) @@ -113,7 +113,7 @@ def test_upload_submission(self): def test_upload_file(self): test_url = 'http://example.com/' with patch('eva_sub_cli.submit.requests.put') as mock_put: - file_to_upload = os.path.join(self.resource_dir, 'EVA_Submission_template.V1.1.4.xlsx') - self.submitter.upload_file(submission_upload_url=test_url, input_file=file_to_upload) + file_to_upload = os.path.join(self.resource_dir, 'EVA_Submission_test.xlsx') + self.submitter._upload_file(submission_upload_url=test_url, input_file=file_to_upload) assert mock_put.mock_calls[0][1][0] == test_url + os.path.basename(file_to_upload) # Cannot test the content of the upload as opening the same file twice give different object diff --git a/tests/test_xlsx2json.py b/tests/test_xlsx2json.py index b987744..e1bc6e0 100644 --- a/tests/test_xlsx2json.py +++ b/tests/test_xlsx2json.py @@ -15,7 +15,7 @@ class TestXlsReader(TestCase): biosample_schema = os.path.abspath(os.path.join(__file__, "../../eva_sub_cli/etc/eva-biosamples.json", )) def test_conversion_2_json(self) -> None: - xls_filename = os.path.join(self.resource_dir, 'EVA_Submission_template.V1.1.4.xlsx') + xls_filename = os.path.join(self.resource_dir, 'EVA_Submission_test.xlsx') self.parser = XlsxParser(xls_filename, self.conf_filename) output_json = os.path.join(self.resource_dir, 'EVA_Submission_template.V1.1.4.json') self.parser.json(output_json)