Skip to content

Commit

Permalink
Add correct genome annotations from NCBI
Browse files Browse the repository at this point in the history
  • Loading branch information
anna-parker committed May 15, 2024
1 parent f267bcc commit 547c459
Show file tree
Hide file tree
Showing 5 changed files with 89 additions and 79 deletions.
30 changes: 13 additions & 17 deletions preprocessing/nextclade/src/loculus_preprocessing/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,24 +27,21 @@ class Config:
nextclade_dataset_server: str = "https://data.clades.nextstrain.org/v3"
config_file: str | None = None
log_level: str = "DEBUG"
genes: list[str] = dataclasses.field(
default_factory=lambda: ['RdRp', 'GPC', 'NP'])
nucleotideSequences: list[str] = dataclasses.field(
default_factory=lambda: ['M', 'S', 'L'])
genes: list[str] = dataclasses.field(default_factory=lambda: ["RdRp", "GPC", "NP"])
nucleotideSequences: list[str] = dataclasses.field(default_factory=lambda: ["M", "S", "L"])
keep_tmp_dir: bool = True
reference_length: int = 197209
batch_size: int = 5
processing_spec: dict[str, dict[str, Any]] = dataclasses.field(default_factory=lambda: {'collection_date': {'function': 'process_date',
'inputs':
{'date': 'collection_date',
'release_date': 'ncbi_release_date'
},
'required': 'true'},
'country':
{'function': 'identity',
'inputs':
{'input': 'country'},
'required': 'true'}})
processing_spec: dict[str, dict[str, Any]] = dataclasses.field(
default_factory=lambda: {
"collection_date": {
"function": "process_date",
"inputs": {"date": "collection_date", "release_date": "ncbi_release_date"},
"required": "true",
},
"country": {"function": "identity", "inputs": {"input": "country"}, "required": "true"},
}
)
pipeline_version: int = 1


Expand Down Expand Up @@ -72,8 +69,7 @@ def kebab(s: str) -> str:


def generate_argparse_from_dataclass(config_cls: type[Config]) -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(
description="Command-line arguments for Config class")
parser = argparse.ArgumentParser(description="Command-line arguments for Config class")
for field in dataclasses.fields(config_cls):
field_name = kebab(field.name)
field_type = base_type(field.type)
Expand Down
112 changes: 54 additions & 58 deletions preprocessing/nextclade/src/loculus_preprocessing/prepro.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import dataclasses
import json
import logging
import os
import subprocess # noqa: S404
import sys
import time
Expand All @@ -10,12 +11,11 @@
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Any, Optional
from distutils.dir_util import copy_tree
import os

import dpath
import requests
from Bio import SeqIO
from distutils.dir_util import copy_tree

from .backend import get_jwt
from .config import Config
Expand Down Expand Up @@ -47,8 +47,7 @@
def fetch_unprocessed_sequences(n: int, config: Config) -> Sequence[UnprocessedEntry]:
url = config.backend_host.rstrip("/") + "/extract-unprocessed-data"
logging.debug(f"Fetching {n} unprocessed sequences from {url}")
params = {"numberOfSequenceEntries": n,
"pipelineVersion": config.pipeline_version}
params = {"numberOfSequenceEntries": n, "pipelineVersion": config.pipeline_version}
headers = {"Authorization": "Bearer " + get_jwt(config)}
response = requests.post(url, data=params, headers=headers, timeout=10)
if not response.ok:
Expand Down Expand Up @@ -87,13 +86,16 @@ def parse_ndjson(ndjson_data: str) -> Sequence[UnprocessedEntry]:
def enrich_with_nextclade(
unprocessed: Sequence[UnprocessedEntry], dataset_dir: str, config: Config
) -> dict[AccessionVersion, UnprocessedAfterNextclade]:
unaligned_nucleotide_sequences: dict[AccessionVersion,
dict[str, Optional[NucleotideSequence]]] = {}
unaligned_nucleotide_sequences: dict[
AccessionVersion, dict[str, Optional[NucleotideSequence]]
] = {}
input_metadata: dict[AccessionVersion, dict[str, Any]] = {}
aligned_aminoacid_sequences: dict[AccessionVersion,
dict[GeneName, AminoAcidSequence | None]] = {}
aligned_nucleotide_sequences: dict[AccessionVersion,
dict[str, Optional[NucleotideSequence]]] = {}
aligned_aminoacid_sequences: dict[
AccessionVersion, dict[GeneName, AminoAcidSequence | None]
] = {}
aligned_nucleotide_sequences: dict[
AccessionVersion, dict[str, Optional[NucleotideSequence]]
] = {}
for entry in unprocessed:
id = entry.accessionVersion
input_metadata[id] = entry.data.metadata
Expand All @@ -105,32 +107,32 @@ def enrich_with_nextclade(
for segment in config.nucleotideSequences:
aligned_nucleotide_sequences[id][segment] = None
if segment in entry.data.unalignedNucleotideSequences:
unaligned_nucleotide_sequences[id][segment] = entry.data.unalignedNucleotideSequences[segment]
unaligned_nucleotide_sequences[id][segment] = (
entry.data.unalignedNucleotideSequences[segment]
)
else:
unaligned_nucleotide_sequences[id][segment] = None

nextclade_metadata: dict[AccessionVersion, dict[str, Any]] = {}
nucleotide_insertions: dict[AccessionVersion,
dict[str, list[NucleotideInsertion]]] = {}
amino_acid_insertions: dict[AccessionVersion,
dict[GeneName, list[AminoAcidInsertion]]] = {}
with TemporaryDirectory(delete=not config.keep_tmp_dir) as result_dir:
nucleotide_insertions: dict[AccessionVersion, dict[str, list[NucleotideInsertion]]] = {}
amino_acid_insertions: dict[AccessionVersion, dict[GeneName, list[AminoAcidInsertion]]] = {}
with TemporaryDirectory(delete=not config.keep_tmp_dir) as result_dir: # noqa: PLR1702
for segment in config.nucleotideSequences:
result_dir_seg = result_dir + "/" + segment
dataset_dir_seg = dataset_dir + "/" + segment
input_file = result_dir_seg + "/input.fasta"
os.makedirs(os.path.dirname(input_file), exist_ok=True)
with open(input_file, "w", encoding="utf-8") as f:
for id, seg_dict in unaligned_nucleotide_sequences.items():
if segment in seg_dict.keys():
if segment in seg_dict:
f.write(f">{id}\n")
f.write(f"{seg_dict[segment]}\n")

command = [
"nextclade3",
"run",
f"--output-all={result_dir}",
f"--input-dataset={dataset_dir}",
f"--output-all={result_dir_seg}",
f"--input-dataset={dataset_dir_seg}",
f"--output-translations={
result_dir}/nextclade.cds_translation.{{cds}}.fasta",
"--jobs=1",
Expand All @@ -145,26 +147,26 @@ def enrich_with_nextclade(
msg = f"nextclade failed with exit code {exit_code}"
raise Exception(msg)

logging.debug(f"Nextclade results available in {result_dir}")
logging.debug("Nextclade results available in %s", result_dir)

with open(result_dir_seg + "/nextclade.aligned.fasta", encoding="utf-8") as aligned_nucs:
with open(
result_dir_seg + "/nextclade.aligned.fasta", encoding="utf-8"
) as aligned_nucs:
aligned_nuc = SeqIO.parse(aligned_nucs, "fasta")
for aligned_sequence in aligned_nuc:
sequence_id: str = aligned_sequence.id
aligned_nucleotide_sequences[sequence_id][segment] = str(
aligned_sequence.seq)
aligned_nucleotide_sequences[sequence_id][segment] = str(aligned_sequence.seq)

for gene in config.genes:
translation_path = result_dir_seg + \
f"/nextclade.cds_translation.{gene}.fasta"
translation_path = result_dir_seg + f"/nextclade.cds_translation.{gene}.fasta"
try:
with open(translation_path, encoding="utf-8") as aligned_translations:
aligned_translation = SeqIO.parse(
aligned_translations, "fasta")
aligned_translation = SeqIO.parse(aligned_translations, "fasta")
for aligned_sequence in aligned_translation:
sequence_id = aligned_sequence.id
aligned_aminoacid_sequences[sequence_id][gene] = str(
aligned_sequence.seq)
aligned_sequence.seq
)
except FileNotFoundError:
# TODO: Add warning to each sequence
logging.info(
Expand All @@ -174,7 +176,8 @@ def enrich_with_nextclade(

parse_nextclade_json(result_dir_seg, nextclade_metadata)
parse_nextclade_tsv(
amino_acid_insertions, nucleotide_insertions, result_dir_seg, config, segment)
amino_acid_insertions, nucleotide_insertions, result_dir_seg, config, segment
)

return {
id: UnprocessedAfterNextclade(
Expand All @@ -190,18 +193,20 @@ def enrich_with_nextclade(
}


def parse_nextclade_tsv(amino_acid_insertions: dict[AccessionVersion, dict[GeneName, list[AminoAcidInsertion]]],
nucleotide_insertions: dict[AccessionVersion, dict[str, list[NucleotideInsertion]]],
result_dir: str, config: Config, segment: str
):
def parse_nextclade_tsv(
amino_acid_insertions: dict[AccessionVersion, dict[GeneName, list[AminoAcidInsertion]]],
nucleotide_insertions: dict[AccessionVersion, dict[str, list[NucleotideInsertion]]],
result_dir: str,
config: Config,
segment: str,
):
with open(result_dir + "/nextclade.tsv", encoding="utf-8") as nextclade_tsv:
reader = csv.DictReader(nextclade_tsv, delimiter="\t")
for row in reader:
id = row["seqName"]

nuc_ins_str = list(row["insertions"].split(","))
nucleotide_insertions[id] = {
segment: [] if nuc_ins_str == [""] else nuc_ins_str}
nucleotide_insertions[id] = {segment: [] if nuc_ins_str == [""] else nuc_ins_str}

aa_ins: dict[str, list[str]] = {gene: [] for gene in config.genes}
aa_ins_split = row["aaInsertions"].split(",")
Expand Down Expand Up @@ -254,11 +259,11 @@ def process_single(
warnings: list[ProcessingAnnotation] = []
len_dict: dict[str, str | int | float | None] = {}
for segment in config.nucleotideSequences:
if unprocessed.unalignedNucleotideSequences[segment] is None:
len_dict["length_" + segment] = None
sequence = unprocessed.unalignedNucleotideSequences[segment]
if sequence:
len_dict["length_" + segment] = len(sequence)
else:
len_dict["length_" +
segment] = len(unprocessed.unalignedNucleotideSequences[segment])
len_dict["length_" + segment] = None
output_metadata: ProcessedMetadata = len_dict

for output_field, spec_dict in config.processing_spec.items():
Expand Down Expand Up @@ -288,7 +293,7 @@ def process_single(
)
)
continue
sub_path = input_path[len(nextclade_prefix):]
sub_path = input_path[len(nextclade_prefix) :]
input_data[arg_name] = str(
dpath.get(
unprocessed.nextcladeMetadata,
Expand All @@ -302,8 +307,7 @@ def process_single(
warnings.append(
ProcessingAnnotation(
source=[
AnnotationSource(
name=input_path, type=AnnotationSourceType.METADATA)
AnnotationSource(name=input_path, type=AnnotationSourceType.METADATA)
],
message=f"Metadata field '{
input_path}' not found in input",
Expand Down Expand Up @@ -355,8 +359,7 @@ def process_all(


def submit_processed_sequences(processed: Sequence[ProcessedEntry], config: Config) -> None:
json_strings = [json.dumps(dataclasses.asdict(sequence))
for sequence in processed]
json_strings = [json.dumps(dataclasses.asdict(sequence)) for sequence in processed]
with open("data.json", "w", encoding="utf-8") as f:
for seq in processed:
json.dump(dataclasses.asdict(seq), f)
Expand All @@ -367,11 +370,9 @@ def submit_processed_sequences(processed: Sequence[ProcessedEntry], config: Conf
"Authorization": "Bearer " + get_jwt(config),
}
params = {"pipelineVersion": config.pipeline_version}
response = requests.post(url, data=ndjson_string,
headers=headers, params=params, timeout=10)
response = requests.post(url, data=ndjson_string, headers=headers, params=params, timeout=10)
if not response.ok:
Path("failed_submission.json").write_text(
ndjson_string, encoding="utf-8")
Path("failed_submission.json").write_text(ndjson_string, encoding="utf-8")
msg = (
f"Submitting processed data failed. Status code: {
response.status_code}\n"
Expand All @@ -394,11 +395,9 @@ def download_nextclade_dataset(dataset_dir: str, config: Config) -> None:
]

if config.nextclade_dataset_tag is not None:
dataset_download_command.append(
f"--tag={config.nextclade_dataset_tag}")
dataset_download_command.append(f"--tag={config.nextclade_dataset_tag}")

logging.info("Downloading Nextclade dataset: %s",
dataset_download_command)
logging.info("Downloading Nextclade dataset: %s", dataset_download_command)
if subprocess.run(dataset_download_command, check=False).returncode != 0: # noqa: S603
msg = "Dataset download failed"
raise RuntimeError(msg)
Expand All @@ -413,12 +412,10 @@ def run(config: Config) -> None:
total_processed = 0
while True:
logging.debug("Fetching unprocessed sequences")
unprocessed = fetch_unprocessed_sequences(
config.batch_size, config)
unprocessed = fetch_unprocessed_sequences(config.batch_size, config)
if len(unprocessed) == 0:
# sleep 1 sec and try again
logging.debug(
"No unprocessed sequences found. Sleeping for 1 second.")
logging.debug("No unprocessed sequences found. Sleeping for 1 second.")
time.sleep(1)
continue
# Process the sequences, get result as dictionary
Expand All @@ -427,8 +424,7 @@ def run(config: Config) -> None:
try:
submit_processed_sequences(processed, config)
except RuntimeError as e:
logging.exception(
"Submitting processed data failed. Traceback : %s", e)
logging.exception("Submitting processed data failed. Traceback : %s", e)
continue
total_processed += len(processed)
logging.info("Processed %s sequences", len(processed))
8 changes: 7 additions & 1 deletion preprocessing/nextstrain/cchf/L/genome_annotation.gff3
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
##gff-version 3
#!gff-spec-version 1.21
#!processor NCBI annotwriter
##sequence-region NC_005301.3 1 12108
NC_005301.3 feature gene 77 11914 . + . gene_name="RdRp"
##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=3052518
NC_005301.3 RefSeq region 1 12108 . + . ID=NC_005301.3:1..12108;Dbxref=taxon:3052518;Name=L;gbkey=Src;genome=genomic;mol_type=genomic RNA;old-name=Crimean-Congo hemorrhagic fever virus;segment=L;strain=IbAr10200
NC_005301.3 RefSeq gene 77 11914 . + . gene_name="RdRp";ID=gene-CCFHV_sLgp1;Dbxref=GeneID:2943075;Name=RdRp;gbkey=Gene;gene=RdRp;gene_biotype=protein_coding;locus_tag=CCFHV_sLgp1
NC_005301.3 RefSeq CDS 77 11914 . + 0 ID=cds-YP_325663.1;Parent=gene-CCFHV_sLgp1;Dbxref=GenBank:YP_325663.1,GeneID:2943075;Name=YP_325663.1;Note=putative polymerase and helicase function%3B putative RNA-dependent RNA polymerase%3B contains OTU-like cysteine protease motif;gbkey=CDS;gene=RdRp;locus_tag=CCFHV_sLgp1;product=putative polyprotein;protein_id=YP_325663.1

9 changes: 8 additions & 1 deletion preprocessing/nextstrain/cchf/M/genome_annotation.gff3
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
##gff-version 3
#!gff-spec-version 1.21
#!processor NCBI annotwriter
##sequence-region NC_005300.2 1 5366
NC_005300.2 feature gene 93 5147 . + . gene_name="GPC"
##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=3052518
NC_005300.2 RefSeq region 1 5366 . + . ID=NC_005300.2:1..5366;Dbxref=taxon:3052518;Name=M;country=Nigeria: Sokoto;gbkey=Src;genome=genomic;mol_type=genomic RNA;nat-host=Hyalomma excavatum;note=from YARU%2C New Haven%2C Conn. SMB passed 13 time in suckling mouse brain followed by 2 passes in CDC Vero E6 cells;old-name=Crimean-Congo hemorrhagic fever virus;segment=M;strain=IbAr10200
NC_005300.2 RefSeq primer_binding_site 1 18 . + . ID=id-NC_005300.2:1..18;gbkey=primer_bind
NC_005300.2 RefSeq gene 93 5147 . + . gene_name="GPC";ID=gene-CCFHV_sMgp1;Dbxref=GeneID:2943074;Name=CCFHV_sMgp1;gbkey=Gene;gene_biotype=protein_coding;locus_tag=CCFHV_sMgp1
NC_005300.2 RefSeq CDS 93 5147 . + 0 ID=cds-NP_950235.1;Parent=gene-CCFHV_sMgp1;Dbxref=GenBank:NP_950235.1,GeneID:2943074;Name=NP_950235.1;gbkey=CDS;locus_tag=CCFHV_sMgp1;product=glycoprotein precursor;protein_id=NP_950235.1
NC_005300.2 RefSeq primer_binding_site 5349 5366 . - . ID=id-NC_005300.2:5349..5366;gbkey=primer_bind
9 changes: 7 additions & 2 deletions preprocessing/nextstrain/cchf/S/genome_annotation.gff3
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
##gff-version 3
##sequence-region NC_005302.1 1 1672
NC_005302.1 feature gene 56 1504 . + . gene_name="NP"
#!gff-spec-version 1.21
#!processor NCBI annotwriter
##sequence-region NC_005302.1 1 1672
##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=3052518
NC_005302.1 RefSeq region 1 1672 . + . ID=NC_005302.1:1..1672;Dbxref=taxon:3052518;Name=S;gbkey=Src;genome=genomic;mol_type=genomic RNA;old-name=Crimean-Congo hemorrhagic fever virus;segment=S;strain=10200
NC_005302.1 RefSeq gene 56 1504 . + . gene_name="NP";ID=gene-CCFHVsSgp1;Dbxref=GeneID:2943076;Name=CCFHVsSgp1;gbkey=Gene;gene_biotype=protein_coding;locus_tag=CCFHVsSgp1
NC_005302.1 RefSeq CDS 56 1504 . + 0 ID=cds-NP_950237.1;Parent=gene-CCFHVsSgp1;Dbxref=GenBank:NP_950237.1,GeneID:2943076;Name=NP_950237.1;gbkey=CDS;locus_tag=CCFHVsSgp1;product=nucleoprotein;protein_id=NP_950237.1

0 comments on commit 547c459

Please sign in to comment.