diff --git a/bin/transform-rki b/bin/transform-rki index c91ab370..3e632dd9 100755 --- a/bin/transform-rki +++ b/bin/transform-rki @@ -29,12 +29,12 @@ from lib.utils.transformpipeline.transforms import (AddHardcodedMetadataRki, UserProvidedAnnotations) COLUMN_MAP = { - "SEQUENCE.DATE_OF_SAMPLING": "date", - "SEQUENCE.PUSHED_TO_DWH": "date_submitted", - "DL.ID": "originating_lab", - "SL.ID": "submitting_lab", - "PANGOLIN.LINEAGE_LATEST": "pango_lineage", - "SEQUENCE.SEQUENCING_REASON": "sampling_strategy", + "date_of_sampling": "date", + "date_of_submission": "date_submitted", + "prime_diagnostic_lab.demis_lab_id": "originating_lab", + "sequencing_lab.demis_lab_id": "submitting_lab", + "lineages": "pango_lineage", + "sequencing_reason": "sampling_strategy", } diff --git a/bin/transform-rki-data-to-ndjson b/bin/transform-rki-data-to-ndjson index e860a126..7d943686 100755 --- a/bin/transform-rki-data-to-ndjson +++ b/bin/transform-rki-data-to-ndjson @@ -5,7 +5,7 @@ Turn RKI files into ndjson format import typer -RKI_INDEX_COL = "SEQUENCE.ID" +RKI_INDEX_COL = "igs_id" def main( input_rki_sequences: str = typer.Option(..., help="Input file"), diff --git a/lib/utils/transformpipeline/transforms.py b/lib/utils/transformpipeline/transforms.py index 681e7ca0..2e2024d0 100644 --- a/lib/utils/transformpipeline/transforms.py +++ b/lib/utils/transformpipeline/transforms.py @@ -1,6 +1,7 @@ import csv import re import unicodedata +import json from collections import defaultdict from typing import Any, Collection, List, MutableMapping, Sequence, Tuple , Dict , Union import pandas as pd @@ -288,6 +289,13 @@ def transform_value(self, entry: dict) -> dict: entry['sequence'] = entry['sequence'].replace('\n', '') entry['length'] = len(entry['sequence']) + # Pull out latest pango lineage from json blob + # Currently this pulls the first entry, but we've added an assert statement to see if there is ever more than one entry + # At that time, we can loop over the json blob to find the 
latest pango lineage assignment + lineage_json_blob = json.loads(entry['pango_lineage']) + entry['pango_lineage'] = lineage_json_blob[0]['lineage'] + assert len(lineage_json_blob)==1, f"RKI pango_lineage unexpectedly had more than one entry. rki_accession: {entry['rki_accession']}" + # Normalize all string data to Unicode Normalization Form C, for # consistent, predictable string comparisons. str_kvs = { @@ -299,7 +307,7 @@ def transform_value(self, entry: dict) -> dict: # Standardize date format to ISO 8601 date date_columns = {'date', 'date_submitted'} - date_formats = {'%Y-%m-%d', '%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%d %H:%M:%S %z'} + date_formats = {'%Y-%m-%d', '%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%d %H:%M:%S %z', '%Y-%m-%dT%H:%M:%S'} for column in date_columns: entry[column] = format_date(entry[column], date_formats)