loculus-project · corneliusroemer · Jun 10, 2024 · Jun 7, 2024 · Jun 7, 2024 · Jun 7, 2024
diff --git a/kubernetes/loculus/templates/_preprocessingFromValues.tpl b/kubernetes/loculus/templates/_preprocessingFromValues.tpl
@@ -13,11 +13,12 @@
 {{- $currentItem := . }}
 {{- if and $use_segments .per_segment }}
 {{- range $segment := $segments }}
-{{ printf "%s_%s :" $currentItem.name $segment}}
-  {{- if $currentItem.type }}
+{{ printf "%s_%s:" $currentItem.name $segment}}
   args:
+    segment: {{ $segment }}
+    {{- if $currentItem.type }}
     type: {{ $currentItem.type }}
-  {{- end }}
+    {{- end }}
   {{- if $currentItem.preprocessing }}
   {{- if hasKey $currentItem.preprocessing "function" }}
   function: {{ index $currentItem.preprocessing "function" }}
@@ -38,7 +39,7 @@
 {{- end}}
 
 {{- else }}
-{{ printf "%s :" .name }}
+{{ printf "%s:" .name }}
   {{- if .type }}
   args:
     type: {{ .type }}

diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml
@@ -776,60 +776,61 @@ defaultOrganismConfig: &defaultOrganismConfig
         header: "Alignment states and QC metrics"
         noInput: true
         rangeSearch: true
+        per_segment: true
         preprocessing:
-          args: { type: int }
           inputs: { input: nextclade.totalSubstitutions }
       - name: total_inserted_nucs
         type: int
         header: "Alignment states and QC metrics"
         noInput: true
         rangeSearch: true
+        per_segment: true
         preprocessing:
-          args: { type: int }
           inputs: { input: nextclade.totalInsertions }
       - name: total_deleted_nucs
         type: int
         header: "Alignment states and QC metrics"
         noInput: true
         rangeSearch: true
+        per_segment: true
         preprocessing:
-          args: { type: int }
           inputs: { input: nextclade.totalDeletions }
       - name: total_ambiguous_nucs
         type: int
         header: "Alignment states and QC metrics"
         noInput: true
         rangeSearch: true
+        per_segment: true
         preprocessing:
-          args: { type: int }
           inputs: { input: "nextclade.totalNonACGTNs" }
       - name: total_unknown_nucs
         type: int
         header: "Alignment states and QC metrics"
         noInput: true
         rangeSearch: true
+        per_segment: true
         preprocessing:
-          args: { type: int }
           inputs: { input: nextclade.totalMissing }
       - name: total_frame_shifts
         type: int
         rangeSearch: true
         header: "Alignment states and QC metrics"
         noInput: true
+        per_segment: true
         preprocessing:
-          args: { type: int }
           inputs: { input: nextclade.totalFrameShifts }
       - name: frame_shifts
         header: "Alignment states and QC metrics"
         noInput: true
+        per_segment: true
         preprocessing:
           inputs: { input: nextclade.frameShifts }
       - name: completeness
         type: float
         header: "Alignment states and QC metrics"
         noInput: true
+        per_segment: true
         preprocessing:
-          args: { type: float }
           inputs: { input: nextclade.coverage }
     website: &website
       tableColumns:
@@ -910,7 +911,6 @@ defaultOrganisms:
           header: "Alignment states and QC metrics"
           noInput: true
           preprocessing:
-            args: { type: int }
             inputs: { input: nextclade.qc.stopCodons.totalStopCodons }
         - name: stop_codons
           header: "Alignment states and QC metrics"
@@ -985,7 +985,6 @@ defaultOrganisms:
           header: "Alignment states and QC metrics"
           noInput: true
           preprocessing:
-            args: { type: int }
             inputs: { input: nextclade.qc.stopCodons.totalStopCodons }
         - name: stop_codons
           header: "Alignment states and QC metrics"
@@ -1046,7 +1045,6 @@ defaultOrganisms:
           header: "Alignment states and QC metrics"
           noInput: true
           preprocessing:
-            args: { type: int }
             inputs: { input: nextclade.qc.stopCodons.totalStopCodons }
         - name: stop_codons
           header: "Alignment states and QC metrics"

diff --git a/preprocessing/nextclade/dev_dependencies.txt b/preprocessing/nextclade/dev_dependencies.txt
@@ -2,3 +2,5 @@ mypy
 ruff
 types-PyYAML
 types-requests
+types-pytz
+types-python-dateutil
diff --git a/preprocessing/nextclade/src/loculus_preprocessing/backend.py b/preprocessing/nextclade/src/loculus_preprocessing/backend.py
@@ -1,13 +1,22 @@
 """Functions to interface with the backend"""
 
+import dataclasses
 import datetime as dt
+import json
 import logging
+import time
+from collections.abc import Sequence
+from http import HTTPStatus
+from pathlib import Path
 
 import jwt
 import pytz
 import requests
 
 from .config import Config
+from .datatypes import (
+    ProcessedEntry,
+)
 
 
 class JwtCache:
@@ -55,3 +64,52 @@ def get_jwt(config: Config) -> str:
         error_msg = f"Fetching JWT failed with status code {response.status_code}: {response.text}"
         logging.error(error_msg)
         raise Exception(error_msg)
+
+
+def fetch_unprocessed_sequences(n: int, config: Config) -> str:
+    """Request `n` unprocessed entries from the backend and return the response text.
+
+    Returns "" after sleeping 60 s when the backend answers 422; raises on other failures.
+    """
+    url = config.backend_host.rstrip("/") + "/extract-unprocessed-data"
+    logging.debug(f"Fetching {n} unprocessed sequences from {url}")
+    params = {"numberOfSequenceEntries": n, "pipelineVersion": config.pipeline_version}
+    headers = {"Authorization": "Bearer " + get_jwt(config)}
+    response = requests.post(url, data=params, headers=headers, timeout=10)
+    if not response.ok:
+        if response.status_code == HTTPStatus.UNPROCESSABLE_ENTITY:
+            logging.debug(f"{response.text}.\nSleeping for a while.")
+            time.sleep(60 * 1)
+            return ""
+        msg = f"Fetching unprocessed data failed. Status code: {response.status_code}"
+        raise Exception(msg, response.text)
+    return response.text
+
+
+def submit_processed_sequences(
+    processed: Sequence[ProcessedEntry], dataset_dir: str, config: Config
+) -> None:
+    """POST the processed entries to the backend as NDJSON.
+
+    On a non-OK response, saves the payload to failed_submission.json and raises RuntimeError.
+    """
+    ndjson_string = "\n".join(json.dumps(dataclasses.asdict(entry)) for entry in processed)
+    if config.keep_tmp_dir:
+        # For debugging: keep the request body (one JSON entry per line) in the dataset dir
+        Path(dataset_dir + "/submission_requests.json").write_text(ndjson_string, encoding="utf-8")
+    url = config.backend_host.rstrip("/") + "/submit-processed-data"
+    headers = {
+        "Content-Type": "application/x-ndjson",
+        "Authorization": "Bearer " + get_jwt(config),
+    }
+    params = {"pipelineVersion": config.pipeline_version}
+    response = requests.post(url, data=ndjson_string, headers=headers, params=params, timeout=10)
+    if not response.ok:
+        Path("failed_submission.json").write_text(ndjson_string, encoding="utf-8")
+        msg = (
+            f"Submitting processed data failed. Status code: {response.status_code}\n"
+            f"Response: {response.text}\n"
+            f"Data sent in request: {ndjson_string[0:1000]}...\n"
+        )
+        raise RuntimeError(msg)
+    logging.info("Processed data submitted successfully")
diff --git a/preprocessing/nextclade/src/loculus_preprocessing/config.py b/preprocessing/nextclade/src/loculus_preprocessing/config.py
@@ -18,7 +18,7 @@
 @dataclass
 class Config:
     organism: str = "mpox"
-    backend_host: str = None  # Set default to None or similar placeholder
+    backend_host: str = ""
     keycloak_host: str = "http://127.0.0.1:8083"
     keycloak_user: str = "preprocessing_pipeline"
     keycloak_password: str = "preprocessing_pipeline"

diff --git a/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py b/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py
@@ -5,6 +5,7 @@
 
 AccessionVersion = str
 GeneName = str
+SegmentName = str
 NucleotideSequence = str
 AminoAcidSequence = str
 NucleotideInsertion = str
@@ -66,10 +67,10 @@ class ProcessingSpec:
 class UnprocessedAfterNextclade:
     inputMetadata: InputMetadata
     # Derived metadata produced by Nextclade
-    nextcladeMetadata: dict[str, Any] | None
-    unalignedNucleotideSequences: dict[str, NucleotideSequence | None]
-    alignedNucleotideSequences: dict[str, NucleotideSequence | None]
-    nucleotideInsertions: dict[str, list[NucleotideInsertion]]
+    nextcladeMetadata: dict[SegmentName, Any] | None
+    unalignedNucleotideSequences: dict[SegmentName, NucleotideSequence | None]
+    alignedNucleotideSequences: dict[SegmentName, NucleotideSequence | None]
+    nucleotideInsertions: dict[SegmentName, list[NucleotideInsertion]]
     alignedAminoAcidSequences: dict[GeneName, AminoAcidSequence | None]
     aminoAcidInsertions: dict[GeneName, list[AminoAcidInsertion]]