Add correct genome annotations from NCBI

loculus-project · May 15, 2024 · 547c459 · 547c459
1 parent f267bcc
commit 547c459
Show file tree

Hide file tree

Showing 5 changed files with 89 additions and 79 deletions.
diff --git a/preprocessing/nextclade/src/loculus_preprocessing/config.py b/preprocessing/nextclade/src/loculus_preprocessing/config.py
@@ -27,24 +27,21 @@ class Config:
     nextclade_dataset_server: str = "https://data.clades.nextstrain.org/v3"
     config_file: str | None = None
     log_level: str = "DEBUG"
-    genes: list[str] = dataclasses.field(
-        default_factory=lambda: ['RdRp', 'GPC', 'NP'])
-    nucleotideSequences: list[str] = dataclasses.field(
-        default_factory=lambda: ['M', 'S', 'L'])
+    genes: list[str] = dataclasses.field(default_factory=lambda: ["RdRp", "GPC", "NP"])
+    nucleotideSequences: list[str] = dataclasses.field(default_factory=lambda: ["M", "S", "L"])
     keep_tmp_dir: bool = True
     reference_length: int = 197209
     batch_size: int = 5
-    processing_spec: dict[str, dict[str, Any]] = dataclasses.field(default_factory=lambda: {'collection_date': {'function': 'process_date',
-                                                                                                                'inputs':
-                                                                                                                {'date': 'collection_date',
-                                                                                                                 'release_date': 'ncbi_release_date'
-                                                                                                                 },
-                                                                                                                'required': 'true'},
-                                                                                            'country':
-                                                                                            {'function': 'identity',
-                                                                                             'inputs':
-                                                                                             {'input': 'country'},
-                                                                                             'required': 'true'}})
+    processing_spec: dict[str, dict[str, Any]] = dataclasses.field(
+        default_factory=lambda: {
+            "collection_date": {
+                "function": "process_date",
+                "inputs": {"date": "collection_date", "release_date": "ncbi_release_date"},
+                "required": "true",
+            },
+            "country": {"function": "identity", "inputs": {"input": "country"}, "required": "true"},
+        }
+    )
     pipeline_version: int = 1
 
 
@@ -72,8 +69,7 @@ def kebab(s: str) -> str:
 
 
 def generate_argparse_from_dataclass(config_cls: type[Config]) -> argparse.ArgumentParser:
-    parser = argparse.ArgumentParser(
-        description="Command-line arguments for Config class")
+    parser = argparse.ArgumentParser(description="Command-line arguments for Config class")
     for field in dataclasses.fields(config_cls):
         field_name = kebab(field.name)
         field_type = base_type(field.type)

diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py
@@ -2,6 +2,7 @@
 import dataclasses
 import json
 import logging
+import os
 import subprocess  # noqa: S404
 import sys
 import time
@@ -10,12 +11,11 @@
 from pathlib import Path
 from tempfile import TemporaryDirectory
 from typing import Any, Optional
-from distutils.dir_util import copy_tree
-import os
 
 import dpath
 import requests
 from Bio import SeqIO
+from distutils.dir_util import copy_tree
 
 from .backend import get_jwt
 from .config import Config
@@ -47,8 +47,7 @@
 def fetch_unprocessed_sequences(n: int, config: Config) -> Sequence[UnprocessedEntry]:
     url = config.backend_host.rstrip("/") + "/extract-unprocessed-data"
     logging.debug(f"Fetching {n} unprocessed sequences from {url}")
-    params = {"numberOfSequenceEntries": n,
-              "pipelineVersion": config.pipeline_version}
+    params = {"numberOfSequenceEntries": n, "pipelineVersion": config.pipeline_version}
     headers = {"Authorization": "Bearer " + get_jwt(config)}
     response = requests.post(url, data=params, headers=headers, timeout=10)
     if not response.ok:
@@ -87,13 +86,16 @@ def parse_ndjson(ndjson_data: str) -> Sequence[UnprocessedEntry]:
 def enrich_with_nextclade(
     unprocessed: Sequence[UnprocessedEntry], dataset_dir: str, config: Config
 ) -> dict[AccessionVersion, UnprocessedAfterNextclade]:
-    unaligned_nucleotide_sequences: dict[AccessionVersion,
-                                         dict[str, Optional[NucleotideSequence]]] = {}
+    unaligned_nucleotide_sequences: dict[
+        AccessionVersion, dict[str, Optional[NucleotideSequence]]
+    ] = {}
     input_metadata: dict[AccessionVersion, dict[str, Any]] = {}
-    aligned_aminoacid_sequences: dict[AccessionVersion,
-                                      dict[GeneName, AminoAcidSequence | None]] = {}
-    aligned_nucleotide_sequences: dict[AccessionVersion,
-                                       dict[str, Optional[NucleotideSequence]]] = {}
+    aligned_aminoacid_sequences: dict[
+        AccessionVersion, dict[GeneName, AminoAcidSequence | None]
+    ] = {}
+    aligned_nucleotide_sequences: dict[
+        AccessionVersion, dict[str, Optional[NucleotideSequence]]
+    ] = {}
     for entry in unprocessed:
         id = entry.accessionVersion
         input_metadata[id] = entry.data.metadata
@@ -105,32 +107,32 @@ def enrich_with_nextclade(
         for segment in config.nucleotideSequences:
             aligned_nucleotide_sequences[id][segment] = None
             if segment in entry.data.unalignedNucleotideSequences:
-                unaligned_nucleotide_sequences[id][segment] = entry.data.unalignedNucleotideSequences[segment]
+                unaligned_nucleotide_sequences[id][segment] = (
+                    entry.data.unalignedNucleotideSequences[segment]
+                )
             else:
                 unaligned_nucleotide_sequences[id][segment] = None
 
     nextclade_metadata: dict[AccessionVersion, dict[str, Any]] = {}
-    nucleotide_insertions: dict[AccessionVersion,
-                                dict[str, list[NucleotideInsertion]]] = {}
-    amino_acid_insertions: dict[AccessionVersion,
-                                dict[GeneName, list[AminoAcidInsertion]]] = {}
-    with TemporaryDirectory(delete=not config.keep_tmp_dir) as result_dir:
+    nucleotide_insertions: dict[AccessionVersion, dict[str, list[NucleotideInsertion]]] = {}
+    amino_acid_insertions: dict[AccessionVersion, dict[GeneName, list[AminoAcidInsertion]]] = {}
+    with TemporaryDirectory(delete=not config.keep_tmp_dir) as result_dir:  # noqa: PLR1702
         for segment in config.nucleotideSequences:
             result_dir_seg = result_dir + "/" + segment
             dataset_dir_seg = dataset_dir + "/" + segment
             input_file = result_dir_seg + "/input.fasta"
             os.makedirs(os.path.dirname(input_file), exist_ok=True)
             with open(input_file, "w", encoding="utf-8") as f:
                 for id, seg_dict in unaligned_nucleotide_sequences.items():
-                    if segment in seg_dict.keys():
+                    if segment in seg_dict:
                         f.write(f">{id}\n")
                         f.write(f"{seg_dict[segment]}\n")
 
             command = [
                 "nextclade3",
                 "run",
-                f"--output-all={result_dir}",
-                f"--input-dataset={dataset_dir}",
+                f"--output-all={result_dir_seg}",
+                f"--input-dataset={dataset_dir_seg}",
                 f"--output-translations={
                     result_dir}/nextclade.cds_translation.{{cds}}.fasta",
                 "--jobs=1",
@@ -145,26 +147,26 @@ def enrich_with_nextclade(
                 msg = f"nextclade failed with exit code {exit_code}"
                 raise Exception(msg)
 
-            logging.debug(f"Nextclade results available in {result_dir}")
+            logging.debug("Nextclade results available in %s", result_dir)
 
-            with open(result_dir_seg + "/nextclade.aligned.fasta", encoding="utf-8") as aligned_nucs:
+            with open(
+                result_dir_seg + "/nextclade.aligned.fasta", encoding="utf-8"
+            ) as aligned_nucs:
                 aligned_nuc = SeqIO.parse(aligned_nucs, "fasta")
                 for aligned_sequence in aligned_nuc:
                     sequence_id: str = aligned_sequence.id
-                    aligned_nucleotide_sequences[sequence_id][segment] = str(
-                        aligned_sequence.seq)
+                    aligned_nucleotide_sequences[sequence_id][segment] = str(aligned_sequence.seq)
 
             for gene in config.genes:
-                translation_path = result_dir_seg + \
-                    f"/nextclade.cds_translation.{gene}.fasta"
+                translation_path = result_dir_seg + f"/nextclade.cds_translation.{gene}.fasta"
                 try:
                     with open(translation_path, encoding="utf-8") as aligned_translations:
-                        aligned_translation = SeqIO.parse(
-                            aligned_translations, "fasta")
+                        aligned_translation = SeqIO.parse(aligned_translations, "fasta")
                         for aligned_sequence in aligned_translation:
                             sequence_id = aligned_sequence.id
                             aligned_aminoacid_sequences[sequence_id][gene] = str(
-                                aligned_sequence.seq)
+                                aligned_sequence.seq
+                            )
                 except FileNotFoundError:
                     # TODO: Add warning to each sequence
                     logging.info(
@@ -174,7 +176,8 @@ def enrich_with_nextclade(
 
             parse_nextclade_json(result_dir_seg, nextclade_metadata)
             parse_nextclade_tsv(
-                amino_acid_insertions, nucleotide_insertions, result_dir_seg, config, segment)
+                amino_acid_insertions, nucleotide_insertions, result_dir_seg, config, segment
+            )
 
     return {
         id: UnprocessedAfterNextclade(
@@ -190,18 +193,20 @@ def enrich_with_nextclade(
     }
 
 
-def parse_nextclade_tsv(amino_acid_insertions: dict[AccessionVersion, dict[GeneName, list[AminoAcidInsertion]]],
-                        nucleotide_insertions: dict[AccessionVersion, dict[str, list[NucleotideInsertion]]],
-                        result_dir: str, config: Config, segment: str
-                        ):
+def parse_nextclade_tsv(
+    amino_acid_insertions: dict[AccessionVersion, dict[GeneName, list[AminoAcidInsertion]]],
+    nucleotide_insertions: dict[AccessionVersion, dict[str, list[NucleotideInsertion]]],
+    result_dir: str,
+    config: Config,
+    segment: str,
+):
     with open(result_dir + "/nextclade.tsv", encoding="utf-8") as nextclade_tsv:
         reader = csv.DictReader(nextclade_tsv, delimiter="\t")
         for row in reader:
             id = row["seqName"]
 
             nuc_ins_str = list(row["insertions"].split(","))
-            nucleotide_insertions[id] = {
-                segment: [] if nuc_ins_str == [""] else nuc_ins_str}
+            nucleotide_insertions[id] = {segment: [] if nuc_ins_str == [""] else nuc_ins_str}
 
             aa_ins: dict[str, list[str]] = {gene: [] for gene in config.genes}
             aa_ins_split = row["aaInsertions"].split(",")
@@ -254,11 +259,11 @@ def process_single(
     warnings: list[ProcessingAnnotation] = []
     len_dict: dict[str, str | int | float | None] = {}
     for segment in config.nucleotideSequences:
-        if unprocessed.unalignedNucleotideSequences[segment] is None:
-            len_dict["length_" + segment] = None
+        sequence = unprocessed.unalignedNucleotideSequences[segment]
+        if sequence:
+            len_dict["length_" + segment] = len(sequence)
         else:
-            len_dict["length_" +
-                     segment] = len(unprocessed.unalignedNucleotideSequences[segment])
+            len_dict["length_" + segment] = None
     output_metadata: ProcessedMetadata = len_dict
 
     for output_field, spec_dict in config.processing_spec.items():
@@ -288,7 +293,7 @@ def process_single(
                         )
                     )
                     continue
-                sub_path = input_path[len(nextclade_prefix):]
+                sub_path = input_path[len(nextclade_prefix) :]
                 input_data[arg_name] = str(
                     dpath.get(
                         unprocessed.nextcladeMetadata,
@@ -302,8 +307,7 @@ def process_single(
                 warnings.append(
                     ProcessingAnnotation(
                         source=[
-                            AnnotationSource(
-                                name=input_path, type=AnnotationSourceType.METADATA)
+                            AnnotationSource(name=input_path, type=AnnotationSourceType.METADATA)
                         ],
                         message=f"Metadata field '{
                             input_path}' not found in input",
@@ -355,8 +359,7 @@ def process_all(
 
 
 def submit_processed_sequences(processed: Sequence[ProcessedEntry], config: Config) -> None:
-    json_strings = [json.dumps(dataclasses.asdict(sequence))
-                    for sequence in processed]
+    json_strings = [json.dumps(dataclasses.asdict(sequence)) for sequence in processed]
     with open("data.json", "w", encoding="utf-8") as f:
         for seq in processed:
             json.dump(dataclasses.asdict(seq), f)
@@ -367,11 +370,9 @@ def submit_processed_sequences(processed: Sequence[ProcessedEntry], config: Conf
         "Authorization": "Bearer " + get_jwt(config),
     }
     params = {"pipelineVersion": config.pipeline_version}
-    response = requests.post(url, data=ndjson_string,
-                             headers=headers, params=params, timeout=10)
+    response = requests.post(url, data=ndjson_string, headers=headers, params=params, timeout=10)
     if not response.ok:
-        Path("failed_submission.json").write_text(
-            ndjson_string, encoding="utf-8")
+        Path("failed_submission.json").write_text(ndjson_string, encoding="utf-8")
         msg = (
             f"Submitting processed data failed. Status code: {
                 response.status_code}\n"
@@ -394,11 +395,9 @@ def download_nextclade_dataset(dataset_dir: str, config: Config) -> None:
         ]
 
         if config.nextclade_dataset_tag is not None:
-            dataset_download_command.append(
-                f"--tag={config.nextclade_dataset_tag}")
+            dataset_download_command.append(f"--tag={config.nextclade_dataset_tag}")
 
-        logging.info("Downloading Nextclade dataset: %s",
-                     dataset_download_command)
+        logging.info("Downloading Nextclade dataset: %s", dataset_download_command)
         if subprocess.run(dataset_download_command, check=False).returncode != 0:  # noqa: S603
             msg = "Dataset download failed"
             raise RuntimeError(msg)
@@ -413,12 +412,10 @@ def run(config: Config) -> None:
         total_processed = 0
         while True:
             logging.debug("Fetching unprocessed sequences")
-            unprocessed = fetch_unprocessed_sequences(
-                config.batch_size, config)
+            unprocessed = fetch_unprocessed_sequences(config.batch_size, config)
             if len(unprocessed) == 0:
                 # sleep 1 sec and try again
-                logging.debug(
-                    "No unprocessed sequences found. Sleeping for 1 second.")
+                logging.debug("No unprocessed sequences found. Sleeping for 1 second.")
                 time.sleep(1)
                 continue
             # Process the sequences, get result as dictionary
@@ -427,8 +424,7 @@ def run(config: Config) -> None:
             try:
                 submit_processed_sequences(processed, config)
             except RuntimeError as e:
-                logging.exception(
-                    "Submitting processed data failed. Traceback : %s", e)
+                logging.exception("Submitting processed data failed. Traceback : %s", e)
                 continue
             total_processed += len(processed)
             logging.info("Processed %s sequences", len(processed))
diff --git a/preprocessing/nextstrain/cchf/L/genome_annotation.gff3 b/preprocessing/nextstrain/cchf/L/genome_annotation.gff3
@@ -1,3 +1,9 @@
 ##gff-version 3
+#!gff-spec-version 1.21
+#!processor NCBI annotwriter
 ##sequence-region NC_005301.3 1 12108
-NC_005301.3	feature	gene	77	11914	.	+	.	gene_name="RdRp"
+##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=3052518
+NC_005301.3	RefSeq	region	1	12108	.	+	.	ID=NC_005301.3:1..12108;Dbxref=taxon:3052518;Name=L;gbkey=Src;genome=genomic;mol_type=genomic RNA;old-name=Crimean-Congo hemorrhagic fever virus;segment=L;strain=IbAr10200
+NC_005301.3	RefSeq	gene	77	11914	.	+	.	gene_name="RdRp";ID=gene-CCFHV_sLgp1;Dbxref=GeneID:2943075;Name=RdRp;gbkey=Gene;gene=RdRp;gene_biotype=protein_coding;locus_tag=CCFHV_sLgp1
+NC_005301.3	RefSeq	CDS	77	11914	.	+	0	ID=cds-YP_325663.1;Parent=gene-CCFHV_sLgp1;Dbxref=GenBank:YP_325663.1,GeneID:2943075;Name=YP_325663.1;Note=putative polymerase and helicase function%3B putative RNA-dependent RNA polymerase%3B contains OTU-like cysteine protease motif;gbkey=CDS;gene=RdRp;locus_tag=CCFHV_sLgp1;product=putative polyprotein;protein_id=YP_325663.1
+
diff --git a/preprocessing/nextstrain/cchf/M/genome_annotation.gff3 b/preprocessing/nextstrain/cchf/M/genome_annotation.gff3
@@ -1,3 +1,10 @@
 ##gff-version 3
+#!gff-spec-version 1.21
+#!processor NCBI annotwriter
 ##sequence-region NC_005300.2 1 5366
-NC_005300.2	feature	gene	93	5147	.	+	.	gene_name="GPC"
+##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=3052518
+NC_005300.2	RefSeq	region	1	5366	.	+	.	ID=NC_005300.2:1..5366;Dbxref=taxon:3052518;Name=M;country=Nigeria: Sokoto;gbkey=Src;genome=genomic;mol_type=genomic RNA;nat-host=Hyalomma excavatum;note=from YARU%2C New Haven%2C Conn. SMB passed 13 time in suckling mouse brain followed by 2 passes in CDC Vero E6 cells;old-name=Crimean-Congo hemorrhagic fever virus;segment=M;strain=IbAr10200
+NC_005300.2	RefSeq	primer_binding_site	1	18	.	+	.	ID=id-NC_005300.2:1..18;gbkey=primer_bind
+NC_005300.2	RefSeq	gene	93	5147	.	+	.	gene_name="GPC";ID=gene-CCFHV_sMgp1;Dbxref=GeneID:2943074;Name=CCFHV_sMgp1;gbkey=Gene;gene_biotype=protein_coding;locus_tag=CCFHV_sMgp1
+NC_005300.2	RefSeq	CDS	93	5147	.	+	0	ID=cds-NP_950235.1;Parent=gene-CCFHV_sMgp1;Dbxref=GenBank:NP_950235.1,GeneID:2943074;Name=NP_950235.1;gbkey=CDS;locus_tag=CCFHV_sMgp1;product=glycoprotein precursor;protein_id=NP_950235.1
+NC_005300.2	RefSeq	primer_binding_site	5349	5366	.	-	.	ID=id-NC_005300.2:5349..5366;gbkey=primer_bind
diff --git a/preprocessing/nextstrain/cchf/S/genome_annotation.gff3 b/preprocessing/nextstrain/cchf/S/genome_annotation.gff3
@@ -1,3 +1,8 @@
 ##gff-version 3
-##sequence-region NC_005302.1  1 1672
-NC_005302.1	feature	gene	56	1504	.	+	.	gene_name="NP"
+#!gff-spec-version 1.21
+#!processor NCBI annotwriter
+##sequence-region NC_005302.1 1 1672
+##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=3052518
+NC_005302.1	RefSeq	region	1	1672	.	+	.	ID=NC_005302.1:1..1672;Dbxref=taxon:3052518;Name=S;gbkey=Src;genome=genomic;mol_type=genomic RNA;old-name=Crimean-Congo hemorrhagic fever virus;segment=S;strain=10200
+NC_005302.1	RefSeq	gene	56	1504	.	+	.	gene_name="NP";ID=gene-CCFHVsSgp1;Dbxref=GeneID:2943076;Name=CCFHVsSgp1;gbkey=Gene;gene_biotype=protein_coding;locus_tag=CCFHVsSgp1
+NC_005302.1	RefSeq	CDS	56	1504	.	+	0	ID=cds-NP_950237.1;Parent=gene-CCFHVsSgp1;Dbxref=GenBank:NP_950237.1,GeneID:2943076;Name=NP_950237.1;gbkey=CDS;locus_tag=CCFHVsSgp1;product=nucleoprotein;protein_id=NP_950237.1