Format with ruff, fix mypy errors, update dev_dependencies

loculus-project · Jun 10, 2024 · 34be11c
1 parent 7f45720
commit 34be11c
Show file tree

Hide file tree

Showing 4 changed files with 19 additions and 15 deletions.
diff --git a/preprocessing/nextclade/dev_dependencies.txt b/preprocessing/nextclade/dev_dependencies.txt
@@ -2,3 +2,5 @@ mypy
 ruff
 types-PyYAML
 types-requests
+types-pytz
+types-python-dateutil
diff --git a/preprocessing/nextclade/src/loculus_preprocessing/backend.py b/preprocessing/nextclade/src/loculus_preprocessing/backend.py
@@ -1,17 +1,17 @@
 """Functions to interface with the backend"""
 
+import dataclasses
 import datetime as dt
+import json
 import logging
+import time
+from collections.abc import Sequence
 from http import HTTPStatus
-import dataclasses
-import json
 from pathlib import Path
 
 import jwt
 import pytz
 import requests
-import time
-from collections.abc import Sequence
 
 from .config import Config
 from .datatypes import (
@@ -76,7 +76,7 @@ def fetch_unprocessed_sequences(n: int, config: Config) -> str:
         if response.status_code == HTTPStatus.UNPROCESSABLE_ENTITY:
             logging.debug(f"{response.text}.\nSleeping for a while.")
             time.sleep(60 * 1)
-            return []
+            return ""
         msg = f"Fetching unprocessed data failed. Status code: {
             response.status_code}"
         raise Exception(

diff --git a/preprocessing/nextclade/src/loculus_preprocessing/config.py b/preprocessing/nextclade/src/loculus_preprocessing/config.py
@@ -18,7 +18,7 @@
 @dataclass
 class Config:
     organism: str = "mpox"
-    backend_host: str = None  # Set default to None or similar placeholder
+    backend_host: str = ""
     keycloak_host: str = "http://127.0.0.1:8083"
     keycloak_user: str = "preprocessing_pipeline"
     keycloak_password: str = "preprocessing_pipeline"

diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py
@@ -21,7 +21,6 @@
     AnnotationSource,
     AnnotationSourceType,
     GeneName,
-    SegmentName,
     InputMetadata,
     NucleotideInsertion,
     NucleotideSequence,
@@ -31,6 +30,7 @@
     ProcessingAnnotation,
     ProcessingResult,
     ProcessingSpec,
+    SegmentName,
     UnprocessedAfterNextclade,
     UnprocessedData,
     UnprocessedEntry,
@@ -195,7 +195,10 @@ def enrich_with_nextclade(
             logging.debug("Nextclade results available in %s", result_dir)
 
             # Add aligned sequences to aligned_nucleotide_sequences
-            load_aligned_nuc_sequences(result_dir_seg, segment, aligned_nucleotide_sequences)
+            # Updates aligned_nucleotide_sequences in place and returns the same dict
+            aligned_nucleotide_sequences = load_aligned_nuc_sequences(
+                result_dir_seg, segment, aligned_nucleotide_sequences
+            )
 
             for gene in config.genes:
                 translation_path = result_dir_seg + f"/nextclade.cds_translation.{gene}.fasta"
@@ -273,7 +276,7 @@ def load_aligned_nuc_sequences(
     aligned_nucleotide_sequences: dict[
         AccessionVersion, dict[SegmentName, NucleotideSequence | None]
     ],
-) -> dict[AccessionVersion, NucleotideSequence]:
+) -> dict[AccessionVersion, dict[SegmentName, NucleotideSequence | None]]:
     """
     Load the nextclade alignment results into the aligned_nucleotide_sequences dict, mapping each
     accession to a segmentName: NucleotideSequence dictionary.
@@ -284,6 +287,7 @@ def load_aligned_nuc_sequences(
             sequence_id: str = aligned_sequence.id
             sequence: NucleotideSequence = str(aligned_sequence.seq)
             aligned_nucleotide_sequences[sequence_id][segment] = mask_terminal_gaps(sequence)
+    return aligned_nucleotide_sequences
 
 
 def accession_from_str(id_str: AccessionVersion) -> str:
@@ -318,6 +322,8 @@ def get_metadata(
         nextclade_prefix = "nextclade."
         if input_path.startswith(nextclade_prefix):
             # Remove "nextclade." prefix
+            if spec.args is None:
+                spec.args = {}
             segment = spec.args.get("segment", "main")
             if unprocessed.nextcladeMetadata is None:
                 errors.append(
@@ -374,15 +380,11 @@ def process_single(
     """Process a single sequence per config"""
     errors: list[ProcessingAnnotation] = []
     warnings: list[ProcessingAnnotation] = []
-    len_dict: dict[str, str | int] = {}
+    output_metadata: ProcessedMetadata = {}
     for segment in config.nucleotideSequences:
         sequence = unprocessed.unalignedNucleotideSequences[segment]
         key = "length" if segment == "main" else "length_" + segment
-        if sequence:
-            len_dict[key] = len(sequence)
-        else:
-            len_dict[key] = 0
-    output_metadata: ProcessedMetadata = len_dict
+        output_metadata[key] = len(sequence) if sequence else 0
 
     for output_field, spec_dict in config.processing_spec.items():
         length_fields = [