Skip to content

Commit

Permalink
feat(ena-submission): Trigger ena submission (#2412)
Browse files Browse the repository at this point in the history
* Change get_ena_submission_list to a daily cronjob (and add the organism name to the JSON output).

* Add a constantly running trigger_submission_to_ena rule, which gets approved sequences from a GitHub repo and adds them to the submission_table.

* Modify V1__Initial_Schema.sql to help with future submissions.
  • Loading branch information
anna-parker authored Aug 13, 2024
1 parent e76d3eb commit b68f2cd
Show file tree
Hide file tree
Showing 13 changed files with 405 additions and 107 deletions.
3 changes: 2 additions & 1 deletion ena-submission/.gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
.snakemake/
results/
results/
__pycache__
31 changes: 26 additions & 5 deletions ena-submission/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -63,13 +63,34 @@ rule get_ena_submission_list:
--log-level {params.log_level} \
"""

rule get_ena_submission_list_and_sleep:
rule trigger_submission_to_ena:
input:
file="results/ena_submission_list.json"
script="scripts/trigger_submission_to_ena.py",
config="results/config.yaml",
output:
submitted=touch("results/triggered"),
params:
log_level=LOG_LEVEL,
shell:
"""
python {input.script} \
--config-file {input.config} \
--log-level {params.log_level} \
"""

rule trigger_submission_to_ena_from_file: # for testing
input:
script="scripts/trigger_submission_to_ena.py",
input_file="results/approved_ena_submission_list.json",
config="results/config.yaml",
output:
file="results/sleep.txt"
submitted=touch("results/triggered_from_file"),
params:
log_level=LOG_LEVEL,
shell:
"""
sleep 360
touch {output.file}
python {input.script} \
--config-file {input.config} \
--input-file {input.input_file} \
--log-level {params.log_level} \
"""
3 changes: 3 additions & 0 deletions ena-submission/config/defaults.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,6 @@ username: external_metadata_updater
password: external_metadata_updater
keycloak_client_id: backend-client
ingest_pipeline_submitter: insdc_ingest_user
github_username: fake_username
github_pat: fake_pat
github_url: https://api.github.com/repos/pathoplexus/ena-submission/contents/test/approved_ena_submission_list.json?ref=main
14 changes: 8 additions & 6 deletions ena-submission/flyway/sql/V1__Initial_Schema.sql
Original file line number Diff line number Diff line change
Expand Up @@ -2,26 +2,28 @@ CREATE TABLE submission_table (
accession text not null,
version bigint not null,
organism text not null,
groupId bigint not null,
group_id bigint not null,
errors jsonb,
warnings jsonb,
status_all text not null,
started_at timestamp not null,
finished_at timestamp,
metadata jsonb,
unaligned_nucleotide_sequences jsonb,
external_metadata jsonb,
primary key (accession, version)
);

CREATE TABLE project_table (
groupId bigint not null,
group_id bigint not null,
organism text not null,
errors jsonb,
warnings jsonb,
status text not null,
started_at timestamp not null,
finished_at timestamp,
project_metadata jsonb,
primary key (groupId, organism)
result jsonb,
primary key (group_id, organism)
);

CREATE TABLE sample_table (
Expand All @@ -32,7 +34,7 @@ CREATE TABLE sample_table (
status text not null,
started_at timestamp not null,
finished_at timestamp,
sample_metadata jsonb,
result jsonb,
primary key (accession, version)
);

Expand All @@ -44,6 +46,6 @@ CREATE TABLE assembly_table (
status text not null,
started_at timestamp not null,
finished_at timestamp,
assembly_metadata jsonb,
result jsonb,
primary key (accession, version)
);
Binary file not shown.
Binary file not shown.
21 changes: 11 additions & 10 deletions ena-submission/scripts/get_ena_submission_list.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,14 @@
import json
import logging
import os
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List
from typing import Any

import click
import yaml
from call_loculus import get_released_data
from notifications import get_slack_config, notify, upload_file_with_comment
from submission_db import get_db_config, in_submission_table
from submission_db_helper import get_db_config, in_submission_table

logger = logging.getLogger(__name__)
logging.basicConfig(
Expand All @@ -22,14 +21,14 @@

@dataclass
class Config:
organisms: List[Dict[str, str]]
organisms: list[dict[str, str]]
organism: str
backend_url: str
keycloak_token_url: str
keycloak_client_id: str
username: str
password: str
ena_specific_metadata: List[str]
ena_specific_metadata: list[str]
ingest_pipeline_submitter: str
db_username: str
db_password: str
Expand All @@ -39,7 +38,7 @@ class Config:
slack_channel_id: str


def get_data_for_submission(config, entries, db_config):
def get_data_for_submission(config, entries, db_config, organism):
"""
Filter data in state APPROVED_FOR_RELEASE:
- data must be state "OPEN" for use
Expand All @@ -66,6 +65,7 @@ def get_data_for_submission(config, entries, db_config):
f"or {config.ingest_pipeline_submitter}. Potential user error: discarding sequence.",
)
continue
item["organism"] = organism
data_dict[key] = item
return data_dict

Expand Down Expand Up @@ -115,7 +115,7 @@ def get_ena_submission_list(log_level, config_file, output_file):
logger.setLevel(log_level)
logging.getLogger("requests").setLevel(logging.WARNING)

with open(config_file) as file:
with open(config_file, encoding="utf-8") as file:
full_config = yaml.safe_load(file)
relevant_config = {key: full_config.get(key, []) for key in Config.__annotations__}
config = Config(**relevant_config)
Expand All @@ -135,14 +135,15 @@ def get_ena_submission_list(log_level, config_file, output_file):
logging.info(f"Getting released sequences for organism: {organism}")

all_entries = get_released_data(config, organism)
entries_to_submit.update(get_data_for_submission(config, all_entries, db_config))
data = get_data_for_submission(config, all_entries, db_config, organism)
entries_to_submit.update(data)

if entries_to_submit:
Path(output_file).write_text(json.dumps(entries_to_submit))
Path(output_file).write_text(json.dumps(entries_to_submit), encoding="utf-8")
send_slack_notification(config, output_file)
else:
logging.info("No sequences found to submit to ENA")
Path(output_file).write_text("")
Path(output_file).write_text("", encoding="utf-8")


if __name__ == "__main__":
Expand Down
84 changes: 0 additions & 84 deletions ena-submission/scripts/submission_db.py

This file was deleted.

Loading

0 comments on commit b68f2cd

Please sign in to comment.