From 34be11c76ecd13544df3b4393adae3b7919ead18 Mon Sep 17 00:00:00 2001
From: Cornelius Roemer <cornelius.roemer@gmail.com>
Date: Mon, 10 Jun 2024 17:21:21 +0200
Subject: [PATCH] Format with ruff, fix mypy errors, update dev_dependencies

---
 preprocessing/nextclade/dev_dependencies.txt  |  2 ++
 .../src/loculus_preprocessing/backend.py      | 10 +++++-----
 .../src/loculus_preprocessing/config.py       |  2 +-
 .../src/loculus_preprocessing/prepro.py       | 20 ++++++++++---------
 4 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/preprocessing/nextclade/dev_dependencies.txt b/preprocessing/nextclade/dev_dependencies.txt
index c532e08f3..fc02661e0 100644
--- a/preprocessing/nextclade/dev_dependencies.txt
+++ b/preprocessing/nextclade/dev_dependencies.txt
@@ -2,3 +2,5 @@ mypy
 ruff
 types-PyYAML
 types-requests
+types-pytz
+types-python-dateutil
\ No newline at end of file
diff --git a/preprocessing/nextclade/src/loculus_preprocessing/backend.py b/preprocessing/nextclade/src/loculus_preprocessing/backend.py
index efd02b5ef..8a6028b56 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/backend.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/backend.py
@@ -1,17 +1,17 @@
 """Functions to interface with the backend"""
 
+import dataclasses
 import datetime as dt
+import json
 import logging
+import time
+from collections.abc import Sequence
 from http import HTTPStatus
-import dataclasses
-import json
 from pathlib import Path
 
 import jwt
 import pytz
 import requests
-import time
-from collections.abc import Sequence
 
 from .config import Config
 from .datatypes import (
@@ -76,7 +76,7 @@ def fetch_unprocessed_sequences(n: int, config: Config) -> str:
         if response.status_code == HTTPStatus.UNPROCESSABLE_ENTITY:
             logging.debug(f"{response.text}.\nSleeping for a while.")
             time.sleep(60 * 1)
-            return []
+            return ""
         msg = f"Fetching unprocessed data failed. Status code: {
             response.status_code}"
         raise Exception(
diff --git a/preprocessing/nextclade/src/loculus_preprocessing/config.py b/preprocessing/nextclade/src/loculus_preprocessing/config.py
index 0c3647236..b32ea1f78 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/config.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/config.py
@@ -18,7 +18,7 @@
 @dataclass
 class Config:
     organism: str = "mpox"
-    backend_host: str = None  # Set default to None or similar placeholder
+    backend_host: str = ""
     keycloak_host: str = "http://127.0.0.1:8083"
     keycloak_user: str = "preprocessing_pipeline"
     keycloak_password: str = "preprocessing_pipeline"
diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py
index e53f44a66..d71c61164 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py
@@ -21,7 +21,6 @@
     AnnotationSource,
     AnnotationSourceType,
     GeneName,
-    SegmentName,
     InputMetadata,
     NucleotideInsertion,
     NucleotideSequence,
@@ -31,6 +30,7 @@
     ProcessingAnnotation,
     ProcessingResult,
     ProcessingSpec,
+    SegmentName,
     UnprocessedAfterNextclade,
     UnprocessedData,
     UnprocessedEntry,
@@ -195,7 +195,10 @@ def enrich_with_nextclade(
             logging.debug("Nextclade results available in %s", result_dir)
 
             # Add aligned sequences to aligned_nucleotide_sequences
-            load_aligned_nuc_sequences(result_dir_seg, segment, aligned_nucleotide_sequences)
+            # Modifies aligned_nucleotide_sequences in place and returns that same dict
+            aligned_nucleotide_sequences = load_aligned_nuc_sequences(
+                result_dir_seg, segment, aligned_nucleotide_sequences
+            )
 
             for gene in config.genes:
                 translation_path = result_dir_seg + f"/nextclade.cds_translation.{gene}.fasta"
@@ -273,7 +276,7 @@ def load_aligned_nuc_sequences(
     aligned_nucleotide_sequences: dict[
         AccessionVersion, dict[SegmentName, NucleotideSequence | None]
     ],
-) -> dict[AccessionVersion, NucleotideSequence]:
+) -> dict[AccessionVersion, dict[SegmentName, NucleotideSequence | None]]:
     """
     Load the nextclade alignment results into the aligned_nucleotide_sequences dict, mapping each
     accession to a segmentName: NucleotideSequence dictionary.
@@ -284,6 +287,7 @@ def load_aligned_nuc_sequences(
             sequence_id: str = aligned_sequence.id
             sequence: NucleotideSequence = str(aligned_sequence.seq)
             aligned_nucleotide_sequences[sequence_id][segment] = mask_terminal_gaps(sequence)
+    return aligned_nucleotide_sequences
 
 
 def accession_from_str(id_str: AccessionVersion) -> str:
@@ -318,6 +322,8 @@ def get_metadata(
         nextclade_prefix = "nextclade."
         if input_path.startswith(nextclade_prefix):
             # Remove "nextclade." prefix
+            if spec.args is None:
+                spec.args = {}
             segment = spec.args.get("segment", "main")
             if unprocessed.nextcladeMetadata is None:
                 errors.append(
@@ -374,15 +380,11 @@ def process_single(
     """Process a single sequence per config"""
     errors: list[ProcessingAnnotation] = []
     warnings: list[ProcessingAnnotation] = []
-    len_dict: dict[str, str | int] = {}
+    output_metadata: ProcessedMetadata = {}
     for segment in config.nucleotideSequences:
         sequence = unprocessed.unalignedNucleotideSequences[segment]
         key = "length" if segment == "main" else "length_" + segment
-        if sequence:
-            len_dict[key] = len(sequence)
-        else:
-            len_dict[key] = 0
-    output_metadata: ProcessedMetadata = len_dict
+        output_metadata[key] = len(sequence) if sequence else 0
 
     for output_field, spec_dict in config.processing_spec.items():
         length_fields = [