nextstrain · j23414 · Sep 13, 2024 · Sep 12, 2024 · Sep 12, 2024 · Sep 12, 2024
diff --git a/bin/transform-rki b/bin/transform-rki
@@ -29,12 +29,12 @@ from lib.utils.transformpipeline.transforms import (AddHardcodedMetadataRki,
                                                     UserProvidedAnnotations)
 
 COLUMN_MAP = {
-    "SEQUENCE.DATE_OF_SAMPLING": "date",
-    "SEQUENCE.PUSHED_TO_DWH": "date_submitted",
-    "DL.ID": "originating_lab",
-    "SL.ID": "submitting_lab",
-    "PANGOLIN.LINEAGE_LATEST": "pango_lineage",
-    "SEQUENCE.SEQUENCING_REASON": "sampling_strategy",
+    "date_of_sampling": "date",
+    "date_of_submission": "date_submitted",
+    "prime_diagnostic_lab.demis_lab_id": "originating_lab",
+    "sequencing_lab.demis_lab_id": "submitting_lab",
+    "lineages": "pango_lineage",
+    "sequencing_reason": "sampling_strategy",
 }
 
 

diff --git a/bin/transform-rki-data-to-ndjson b/bin/transform-rki-data-to-ndjson
@@ -5,7 +5,7 @@ Turn RKI files into ndjson format
 
 import typer
 
-RKI_INDEX_COL = "SEQUENCE.ID"
+RKI_INDEX_COL = "igs_id"
 
 def main(
     input_rki_sequences: str = typer.Option(..., help="Input file"),

diff --git a/lib/utils/transformpipeline/transforms.py b/lib/utils/transformpipeline/transforms.py
@@ -1,6 +1,7 @@
 import csv
 import re
 import unicodedata
+import json
 from collections import defaultdict
 from typing import Any, Collection, List, MutableMapping, Sequence, Tuple , Dict , Union
 import pandas as pd
@@ -288,6 +289,13 @@ def transform_value(self, entry: dict) -> dict:
         entry['sequence'] = entry['sequence'].replace('\n', '')
         entry['length'] = len(entry['sequence'])
 
+        # Pull out the latest pango lineage from the JSON blob.
+        # Currently this takes the first entry; the assert below flags any record that ever has more than one entry.
+        # If that happens, we can loop over the JSON blob to find the latest pango lineage assignment.
+        lineage_json_blob = json.loads(entry['pango_lineage'])
+        entry['pango_lineage'] = lineage_json_blob[0]['lineage']
+        assert len(lineage_json_blob)==1, f"RKI pango_lineage unexpectedly had more than one entry. rki_accession: {entry['rki_accession']}"
+
         # Normalize all string data to Unicode Normalization Form C, for
         # consistent, predictable string comparisons.
         str_kvs = {
@@ -299,7 +307,7 @@ def transform_value(self, entry: dict) -> dict:
 
         # Standardize date format to ISO 8601 date
         date_columns = {'date', 'date_submitted'}
-        date_formats = {'%Y-%m-%d', '%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%d %H:%M:%S %z'}
+        date_formats = {'%Y-%m-%d', '%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%d %H:%M:%S %z', '%Y-%m-%dT%H:%M:%S'}
         for column in date_columns:
             entry[column] = format_date(entry[column], date_formats)