Skip to content

Commit

Permalink
Reset create_ena_projects branch on main.
Browse files Browse the repository at this point in the history
  • Loading branch information
anna-parker committed Jul 23, 2024
1 parent 6fd7e65 commit b464762
Show file tree
Hide file tree
Showing 20 changed files with 1,201 additions and 93 deletions.
37 changes: 37 additions & 0 deletions .github/workflows/ena-submission-tests.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# CI workflow that runs the ena-submission unit tests.
name: ena-submission-tests
on:
  pull_request:
    # Only run for PRs that touch the ena-submission code or this workflow itself.
    paths:
      - "ena-submission/**"
      # Must match this file's real name (.yaml, not .yml); otherwise edits to
      # the workflow alone would never trigger it.
      - ".github/workflows/ena-submission-tests.yaml"
  push:
    branches:
      - main
  workflow_dispatch:
concurrency:
  # On main the group is keyed by run_id, so every run gets its own group and
  # is never cancelled; on other refs a newer run cancels the one in progress.
  group: ci-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}-ena-submission-tests
  cancel-in-progress: true
jobs:
  unitTests:
    name: Unit Tests
    runs-on: codebuild-loculus-ci-${{ github.run_id }}-${{ github.run_attempt }}
    timeout-minutes: 15
    steps:
      - uses: actions/checkout@v4
      - name: Set up micromamba
        uses: mamba-org/setup-micromamba@v1
        with:
          environment-file: ena-submission/environment.yml
          micromamba-version: 'latest'
          init-shell: >-
            bash
            powershell
          cache-environment: true
          post-cleanup: 'all'
      - name: Run tests
        run: |
          micromamba activate loculus-ena-submission
          python3 scripts/test_ena_submission.py
        shell: micromamba-shell {0}
        working-directory: ena-submission
3 changes: 2 additions & 1 deletion ena-submission/.gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
.snakemake/
results/
results/
__pycache__
2 changes: 1 addition & 1 deletion ena-submission/ENA_submission.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ We require the following components:

- Analysis: An analysis contains secondary analysis results derived from sequence reads (e.g. a genome assembly).

At the time of writing (October 2023), in contrast to ENA, Pathoplexus has no hierarchy of study/sample/sequence: every sequence is its own study and sample. Therefore we need to figure out how to map sequences to projects, each submitter could have exactly _one_ study pre organism (this is the approach we are currently taking), or each sequence could be associated with its own study.
At the time of writing (October 2023), in contrast to ENA, Pathoplexus has no hierarchy of study/sample/sequence: every sequence is its own study and sample. Thus, each sequence will have to be submitted to ENA as its own study and sample. Alternatively, each submitter could have exactly _one_ study per organism (this is the approach we are currently taking).

### Mapping sequences and studies

Expand Down
33 changes: 33 additions & 0 deletions ena-submission/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ with open("results/config.yaml", "w") as f:
LOG_LEVEL = config.get("log_level", "INFO")
ORGANISMS = config['organisms'].keys()


rule submit_all_external_metadata:
input:
expand("results/submitted_{organism}.json", organism=ORGANISMS)
Expand Down Expand Up @@ -63,6 +64,38 @@ rule get_ena_submission_list:
--log-level {params.log_level} \
"""

# Runs scripts/trigger_submission_to_ena.py, passing it the resolved config
# file, the approved submission list and the configured log level.
rule trigger_submission_to_ena:
    input:
        # The script is listed as an input so that editing it re-triggers the rule.
        script="scripts/trigger_submission_to_ena.py",
        input_file="results/approved_ena_submission_list.json",
        config="results/config.yaml",
    output:
        # touch(): Snakemake creates this empty marker file once the shell
        # command has finished successfully.
        submitted=touch("results/submitted"),
    params:
        log_level=LOG_LEVEL,
    shell:
        """
        python {input.script} \
            --config-file {input.config} \
            --input-file {input.input_file} \
            --log-level {params.log_level} \
        """

# Runs scripts/create_project.py, passing it the resolved config file and the
# configured log level.
rule create_project:
    input:
        # The script is listed as an input so that editing it re-triggers the rule.
        script="scripts/create_project.py",
        config="results/config.yaml",
    output:
        # touch(): Snakemake creates this empty marker file once the shell
        # command has finished successfully.
        project_created=touch("results/project_created"),
    params:
        log_level=LOG_LEVEL,
    shell:
        """
        python {input.script} \
            --config-file {input.config} \
            --log-level {params.log_level} \
        """

rule get_ena_submission_list_and_sleep:
input:
file="results/ena_submission_list.json"
Expand Down
2 changes: 2 additions & 0 deletions ena-submission/config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ organisms:
- M
- S
taxon_id: 3052518
scientific_name: "Orthonairovirus haemorrhagiae"
organismName: "Crimean-Congo Hemorrhagic Fever Virus"
externalMetadata:
- externalMetadataUpdater: ena
Expand Down Expand Up @@ -87,6 +88,7 @@ organisms:
ebola-sudan:
ingest:
taxon_id: 3052460
scientific_name: "Orthoebolavirus sudanense"
organismName: "Ebola Sudan"
externalMetadata:
- externalMetadataUpdater: ena
Expand Down
6 changes: 5 additions & 1 deletion ena-submission/config/defaults.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
username: external_metadata_updater
password: external_metadata_updater
keycloak_client_id: backend-client
ingest_pipeline_submitter: insdc_ingest_user
db_name: Loculus
unique_project_suffix: Loculus
ena_submission_username: fake-user
ena_submission_password: fake-password
ena_submission_url: https://wwwdev.ebi.ac.uk/ena/submit/drop-box/submit
1 change: 1 addition & 0 deletions ena-submission/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@ dependencies:
- snakemake
- unzip
- psycopg2
- xmltodict
10 changes: 6 additions & 4 deletions ena-submission/flyway/sql/V1__Initial_Schema.sql
Original file line number Diff line number Diff line change
Expand Up @@ -2,26 +2,28 @@ CREATE TABLE submission_table (
accession text not null,
version bigint not null,
organism text not null,
groupId bigint not null,
group_id bigint not null,
errors jsonb,
warnings jsonb,
status_all text not null,
started_at timestamp not null,
finished_at timestamp,
metadata jsonb,
aligned_nucleotide_sequences jsonb,
external_metadata jsonb,
primary key (accession, version)
);

CREATE TABLE project_table (
groupId bigint not null,
group_id bigint not null,
organism text not null,
errors jsonb,
warnings jsonb,
status text not null,
started_at timestamp not null,
finished_at timestamp,
project_metadata jsonb,
primary key (groupId, organism)
result jsonb,
primary key (group_id, organism)
);

CREATE TABLE sample_table (
Expand Down
Binary file not shown.
26 changes: 26 additions & 0 deletions ena-submission/scripts/call_loculus.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,32 @@ def submit_external_metadata(
return response


def get_group_info(config: Config, group_id: int) -> list[dict[str, Any]]:
    """Fetch the backend's information for the group with the given id.

    Sends a GET request to ``{backend_url}/groups/{group_id}`` and decodes the
    response body as JSON Lines.

    Args:
        config: Configuration forwarded to ``backend_url`` and ``make_request``.
        group_id: Id of the group to look up.

    Returns:
        The decoded entries of the response, one per line of the body.

    Raises:
        ValueError: If the response body cannot be decoded as JSON Lines.
        Exception: Whatever ``response.raise_for_status()`` raises when the
            backend answers with a non-success status.
    """
    url = f"{backend_url(config)}/groups/{group_id}"

    headers = {"Content-Type": "application/json"}

    response = make_request(HTTPMethod.GET, url, config, headers=headers)
    if not response.ok:
        # Log the raw body: an error response is not guaranteed to be valid
        # JSON, and a decoding failure here would mask the HTTP error below.
        logger.error(response.text)
        response.raise_for_status()

    try:
        return list(jsonlines.Reader(response.iter_lines()).iter())
    except jsonlines.Error as err:
        # Keep the log readable by truncating long bodies to head and tail.
        response_summary = response.text
        if len(response_summary) > 100:
            response_summary = response_summary[:50] + "\n[..]\n" + response_summary[-50:]
        logger.error(f"Error decoding JSON from /groups/{group_id}: {response_summary}")
        raise ValueError(f"Could not decode response from /groups/{group_id}") from err


def get_released_data(config: Config, organism: str) -> dict[str, Any]:
"""Get sequences that are ready for release"""

Expand Down
Loading

0 comments on commit b464762

Please sign in to comment.