loculus-project · corneliusroemer · Jun 10, 2024 · Jun 7, 2024 · Jun 7, 2024 · Jun 7, 2024
diff --git a/docs/src/content/docs/guides/getting-started.md b/docs/src/content/docs/guides/getting-started.md
@@ -169,12 +169,12 @@ organisms:
         - name: length
           type: int
           header: "Length"
-          per_segment: true
+          perSegment: true
 ```
 
 Additionally, if you are using the preprocessing or ingest pipelines, `nucleotideSequences` must also be defined in those sections of the config.
 
-Metadata fields can be isolate- or segment-specific. By default we assume metadata fields are isolate-specific (i.e. are shared across all segments), therefore segment-specific fields must be marked as `per_segment` in the config file. Marking a field as `per_segment` will result in that field existing for each segment. In the example above, instead of there being one metadata field called `length` there will now be three fields called `length_L`, `length_M` and `length_S`.
+Metadata fields can be isolate- or segment-specific. By default we assume metadata fields are isolate-specific (i.e. are shared across all segments), therefore segment-specific fields must be marked as `perSegment` in the config file. Marking a field as `perSegment` will result in that field existing for each segment. In the example above, instead of there being one metadata field called `length` there will now be three fields called `length_L`, `length_M` and `length_S`.
 
 Loculus expects multi-segmented pathogen sequences to be submitted in a specific format. Fasta files should have a separate entry/record for each segment, with a Fasta header of `>[submissionID]_[segmentName]`, e.g. `>sample123_L` for the `L` segment of the sample with the submissionID `sample123`. Metadata is uploaded for an entire sequence entry, rather than per segment, i.e. there will be only one row for each `submissionID`.
 

diff --git a/kubernetes/loculus/templates/_common-metadata.tpl b/kubernetes/loculus/templates/_common-metadata.tpl
@@ -148,7 +148,7 @@ fields:
 {{- end }}
 {{- range $metadataList }}
 {{- $currentItem := . }}
-{{- if and $use_segments .per_segment }}
+{{- if and $use_segments .perSegment }}
   {{- range $segment := $segments }}
     - name: {{ printf "%s_%s" $currentItem.name $segment | quote }}
       type: {{ $currentItem.type | default "string" | quote }}
@@ -251,8 +251,8 @@ fields:
 {{- end }}
 {{- range $metadataList }}
 {{- $currentItem := . }}
-{{- $per_segment := (.per_segment | default false )}}
-{{- if and $use_segments $per_segment }}
+{{- $perSegment := (.perSegment | default false )}}
+{{- if and $use_segments $perSegment }}
 {{- range $segment := $segments }}
   - name: {{ printf "%s_%s" $currentItem.name $segment | quote }}
     type: {{ $currentItem.type | default "string" | quote }}

diff --git a/kubernetes/loculus/templates/_preprocessingFromValues.tpl b/kubernetes/loculus/templates/_preprocessingFromValues.tpl
@@ -11,13 +11,14 @@
 
 {{- range $metadata }}
 {{- $currentItem := . }}
-{{- if and $use_segments .per_segment }}
+{{- if and $use_segments .perSegment }}
 {{- range $segment := $segments }}
-{{ printf "%s_%s :" $currentItem.name $segment}}
-  {{- if $currentItem.type }}
+{{ printf "%s_%s:" $currentItem.name $segment}}
   args:
+    segment: {{ $segment }}
+    {{- if $currentItem.type }}
     type: {{ $currentItem.type }}
-  {{- end }}
+    {{- end }}
   {{- if $currentItem.preprocessing }}
   {{- if hasKey $currentItem.preprocessing "function" }}
   function: {{ index $currentItem.preprocessing "function" }}
@@ -38,7 +39,7 @@
 {{- end}}
 
 {{- else }}
-{{ printf "%s :" .name }}
+{{ printf "%s:" .name }}
   {{- if .type }}
   args:
     type: {{ .type }}

diff --git a/kubernetes/loculus/templates/lapis-silo-database-config.yaml b/kubernetes/loculus/templates/lapis-silo-database-config.yaml
@@ -24,7 +24,7 @@ data:
       metadata:
       {{- range (concat $commonMetadata .metadata) }}
       {{- $currentItem := . }}
-      {{- if and $use_segments .per_segment }}
+      {{- if and $use_segments .perSegment }}
         {{- range $segment := $nucleotideSequences }}
         - name: {{ printf "%s_%s" $currentItem.name $segment | quote }}
           {{- $type := default "string" $currentItem.type }}

diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml
@@ -143,13 +143,13 @@ defaultOrganismConfig: &defaultOrganismConfig
         header: "INSDC"
         hideOnSequenceDetailsPage: true
         noInput: true
-        per_segment: true
+        perSegment: true
       - name: insdc_version
         type: int
         header: "INSDC"
         hideOnSequenceDetailsPage: true
         noInput: true
-        per_segment: true
+        perSegment: true
       - name: insdc_accession_full
         displayName: INSDC accession
         customDisplay:
@@ -158,22 +158,22 @@ defaultOrganismConfig: &defaultOrganismConfig
         header: "INSDC"
         ingest: genbank_accession
         noInput: true
-        per_segment: true
+        perSegment: true
       - name: bioproject_accessions
         customDisplay:
           type: link
           url: "https://www.ncbi.nlm.nih.gov/bioproject/__value__"
         header: "INSDC"
         ingest: bioprojects
         noInput: true
-        per_segment: true
+        perSegment: true
       - name: biosample_accession
         customDisplay:
           type: link
           url: "https://www.ncbi.nlm.nih.gov/biosample/__value__"
         header: "INSDC" 
         noInput: true
-        per_segment: true
+        perSegment: true
       - name: culture_id
         displayName: Culture ID
         header: Sample details
@@ -698,13 +698,13 @@ defaultOrganismConfig: &defaultOrganismConfig
         autocomplete: true
         header: "INSDC"
         noInput: true
-        per_segment: true
+        perSegment: true
       - name: length
         type: int
         header: "Alignment states and QC metrics"
         noInput: true
         rangeSearch: true
-        per_segment: true
+        perSegment: true
       - name: host_name_scientific
         generateIndex: true
         autocomplete: true
@@ -776,60 +776,61 @@ defaultOrganismConfig: &defaultOrganismConfig
         header: "Alignment states and QC metrics"
         noInput: true
         rangeSearch: true
+        perSegment: true
         preprocessing:
-          args: { type: int }
           inputs: { input: nextclade.totalSubstitutions }
       - name: total_inserted_nucs
         type: int
         header: "Alignment states and QC metrics"
         noInput: true
         rangeSearch: true
+        perSegment: true
         preprocessing:
-          args: { type: int }
           inputs: { input: nextclade.totalInsertions }
       - name: total_deleted_nucs
         type: int
         header: "Alignment states and QC metrics"
         noInput: true
         rangeSearch: true
+        perSegment: true
         preprocessing:
-          args: { type: int }
           inputs: { input: nextclade.totalDeletions }
       - name: total_ambiguous_nucs
         type: int
         header: "Alignment states and QC metrics"
         noInput: true
         rangeSearch: true
+        perSegment: true
         preprocessing:
-          args: { type: int }
           inputs: { input: "nextclade.totalNonACGTNs" }
       - name: total_unknown_nucs
         type: int
         header: "Alignment states and QC metrics"
         noInput: true
         rangeSearch: true
+        perSegment: true
         preprocessing:
-          args: { type: int }
           inputs: { input: nextclade.totalMissing }
       - name: total_frame_shifts
         type: int
         rangeSearch: true
         header: "Alignment states and QC metrics"
         noInput: true
+        perSegment: true
         preprocessing:
-          args: { type: int }
           inputs: { input: nextclade.totalFrameShifts }
       - name: frame_shifts
         header: "Alignment states and QC metrics"
         noInput: true
+        perSegment: true
         preprocessing:
           inputs: { input: nextclade.frameShifts }
       - name: completeness
         type: float
         header: "Alignment states and QC metrics"
         noInput: true
+        perSegment: true
         preprocessing:
-          args: { type: float }
           inputs: { input: nextclade.coverage }
     website: &website
       tableColumns:
@@ -910,7 +911,6 @@ defaultOrganisms:
           header: "Alignment states and QC metrics"
           noInput: true
           preprocessing:
-            args: { type: int }
             inputs: { input: nextclade.qc.stopCodons.totalStopCodons }
         - name: stop_codons
           header: "Alignment states and QC metrics"
@@ -985,7 +985,6 @@ defaultOrganisms:
           header: "Alignment states and QC metrics"
           noInput: true
           preprocessing:
-            args: { type: int }
             inputs: { input: nextclade.qc.stopCodons.totalStopCodons }
         - name: stop_codons
           header: "Alignment states and QC metrics"
@@ -1046,7 +1045,6 @@ defaultOrganisms:
           header: "Alignment states and QC metrics"
           noInput: true
           preprocessing:
-            args: { type: int }
             inputs: { input: nextclade.qc.stopCodons.totalStopCodons }
         - name: stop_codons
           header: "Alignment states and QC metrics"

diff --git a/preprocessing/nextclade/dev_dependencies.txt b/preprocessing/nextclade/dev_dependencies.txt
@@ -2,3 +2,5 @@ mypy
 ruff
 types-PyYAML
 types-requests
+types-pytz
+types-python-dateutil
diff --git a/preprocessing/nextclade/src/loculus_preprocessing/backend.py b/preprocessing/nextclade/src/loculus_preprocessing/backend.py
@@ -1,13 +1,22 @@
 """Functions to interface with the backend"""
 
+import dataclasses
 import datetime as dt
+import json
 import logging
+import time
+from collections.abc import Sequence
+from http import HTTPStatus
+from pathlib import Path
 
 import jwt
 import pytz
 import requests
 
 from .config import Config
+from .datatypes import (
+    ProcessedEntry,
+)
 
 
 class JwtCache:
@@ -55,3 +64,52 @@ def get_jwt(config: Config) -> str:
         error_msg = f"Fetching JWT failed with status code {response.status_code}: {response.text}"
         logging.error(error_msg)
         raise Exception(error_msg)
+
+
+def fetch_unprocessed_sequences(n: int, config: Config) -> str:
+    url = config.backend_host.rstrip("/") + "/extract-unprocessed-data"
+    logging.debug(f"Fetching {n} unprocessed sequences from {url}")
+    params = {"numberOfSequenceEntries": n, "pipelineVersion": config.pipeline_version}
+    headers = {"Authorization": "Bearer " + get_jwt(config)}
+    response = requests.post(url, data=params, headers=headers, timeout=10)
+    if not response.ok:
+        if response.status_code == HTTPStatus.UNPROCESSABLE_ENTITY:
+            logging.debug(f"{response.text}.\nSleeping for a while.")
+            time.sleep(60 * 1)
+            return ""
+        msg = f"Fetching unprocessed data failed. Status code: {
+            response.status_code}"
+        raise Exception(
+            msg,
+            response.text,
+        )
+    return response.text
+
+
+def submit_processed_sequences(
+    processed: Sequence[ProcessedEntry], dataset_dir: str, config: Config
+) -> None:
+    json_strings = [json.dumps(dataclasses.asdict(sequence)) for sequence in processed]
+    if config.keep_tmp_dir:
+        # For debugging: write all submit requests to submission_requests.json
+        with open(dataset_dir + "/submission_requests.json", "w", encoding="utf-8") as f:
+            for seq in processed:
+                json.dump(dataclasses.asdict(seq), f)
+    ndjson_string = "\n".join(json_strings)
+    url = config.backend_host.rstrip("/") + "/submit-processed-data"
+    headers = {
+        "Content-Type": "application/x-ndjson",
+        "Authorization": "Bearer " + get_jwt(config),
+    }
+    params = {"pipelineVersion": config.pipeline_version}
+    response = requests.post(url, data=ndjson_string, headers=headers, params=params, timeout=10)
+    if not response.ok:
+        Path("failed_submission.json").write_text(ndjson_string, encoding="utf-8")
+        msg = (
+            f"Submitting processed data failed. Status code: {
+                response.status_code}\n"
+            f"Response: {response.text}\n"
+            f"Data sent in request: {ndjson_string[0:1000]}...\n"
+        )
+        raise RuntimeError(msg)
+    logging.info("Processed data submitted successfully")
diff --git a/preprocessing/nextclade/src/loculus_preprocessing/config.py b/preprocessing/nextclade/src/loculus_preprocessing/config.py
@@ -18,7 +18,7 @@
 @dataclass
 class Config:
     organism: str = "mpox"
-    backend_host: str = None  # Set default to None or similar placeholder
+    backend_host: str = ""
     keycloak_host: str = "http://127.0.0.1:8083"
     keycloak_user: str = "preprocessing_pipeline"
     keycloak_password: str = "preprocessing_pipeline"

diff --git a/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py b/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py
@@ -5,6 +5,7 @@
 
 AccessionVersion = str
 GeneName = str
+SegmentName = str
 NucleotideSequence = str
 AminoAcidSequence = str
 NucleotideInsertion = str
@@ -66,10 +67,10 @@ class ProcessingSpec:
 class UnprocessedAfterNextclade:
     inputMetadata: InputMetadata
     # Derived metadata produced by Nextclade
-    nextcladeMetadata: dict[str, Any] | None
-    unalignedNucleotideSequences: dict[str, NucleotideSequence | None]
-    alignedNucleotideSequences: dict[str, NucleotideSequence | None]
-    nucleotideInsertions: dict[str, list[NucleotideInsertion]]
+    nextcladeMetadata: dict[SegmentName, Any] | None
+    unalignedNucleotideSequences: dict[SegmentName, NucleotideSequence | None]
+    alignedNucleotideSequences: dict[SegmentName, NucleotideSequence | None]
+    nucleotideInsertions: dict[SegmentName, list[NucleotideInsertion]]
     alignedAminoAcidSequences: dict[GeneName, AminoAcidSequence | None]
     aminoAcidInsertions: dict[GeneName, list[AminoAcidInsertion]]