refactor: obds-fhir-to-opal with focus on first POC FA for publication (#198)

* refactor obds-fhir-to-opal with focus on first POC FA for publication

* megalinter fixes

* test utils

* megalinter happy

* megalinter please be happy

* adds readme

* changed volume location in container to match the one in source code

* add plotting nbs

* use match-case pattern for map_gender (see sketch below)

* addressed megalinter complaints

* formatted decompose_xml

* fix bandit

* fix iteration in R with seq_along

---------

Co-authored-by: chgl <[email protected]>
jasminziegler and chgl authored Aug 19, 2024
1 parent 4c26c98 commit dae745b
Showing 21 changed files with 1,550 additions and 628 deletions.
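
For context on the "use match-case pattern for map_gender" item above, here is a minimal sketch of such a mapping. The numeric codes are taken from the subset comments in entities_gender.R further down (1 = female, 2 = male, 3 = other/diverse); the exact signature and the fallback for unknown values in src/obds_fhir_to_opal are assumptions.

def map_gender(gender: str | None) -> int:
    # codes per the comments in entities_gender.R: 1 = female, 2 = male,
    # 3 = other/diverse; returning 0 for unknown/missing is an assumption
    match gender:
        case "female":
            return 1
        case "male":
            return 2
        case "other" | "diverse":
            return 3
        case _:
            return 0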
4 changes: 2 additions & 2 deletions .bandit.yml
@@ -1,5 +1,5 @@
skips:
- "B405"
- "B314"
exclude_dirs:
- "*_test.py"
assert_used:
skips: ["**/test_*.py", "**/*_test.py"]
2 changes: 2 additions & 0 deletions .gitignore
@@ -17,3 +17,5 @@ bundles-delta/
*.csv

.vscode

.venv/
1 change: 1 addition & 0 deletions .mega-linter.yml
@@ -11,6 +11,7 @@ DISABLE_LINTERS:
# due to import-error
- PYTHON_PYLINT
- SPELL_LYCHEE
- R_LINTR

SHOW_ELAPSED_TIME: true

2 changes: 1 addition & 1 deletion docker-compose/compose.obds-fhir-to-opal.yaml
@@ -19,4 +19,4 @@ services:
KAFKA_PROCEDURE_TOPIC: "fhir.obds.Procedure"
KAFKA_MEDICATIONSTATEMENT_TOPIC: "fhir.obds.MedicationStatement"
volumes:
- ${PWD}/opal-output:/opt/bitnami/spark/opal-output
- ${PWD}/opal-output:/home/spark/opal-output
20 changes: 9 additions & 11 deletions src/decompose_xmls/decompose_xmls.py
@@ -1,10 +1,10 @@
import json
import os
import time
import regex
import xml.etree.ElementTree as ET
from io import BytesIO

import regex
from confluent_kafka import Producer
from pydantic import BaseSettings

@@ -48,8 +48,8 @@ def kafka_delivery_report(err, msg):


def remove_leading_zeros(patient_id: str) -> str:
if os.environ.get('REMOVE_LEADING_PATIENTID_ZEROS') == 'true':
return regex.sub(r'^0+', '', patient_id)
if os.environ.get("REMOVE_LEADING_PATIENTID_ZEROS") == "true":
return regex.sub(r"^0+", "", patient_id)
else:
return patient_id

@@ -86,7 +86,7 @@ def decompose_sammelmeldung(root: ET.Element, filename: str):
continue

# remove all Menge_Meldung
menge_meldung = patient.find('./{http://www.gekid.de/namespace}Menge_Meldung')
menge_meldung = patient.find("./{http://www.gekid.de/namespace}Menge_Meldung")
if menge_meldung is not None:
patient.remove(menge_meldung)

@@ -122,9 +122,7 @@ def decompose_sammelmeldung(root: ET.Element, filename: str):
f = BytesIO()
tree = ET.ElementTree(meldung_root)
ET.indent(tree, " ")
tree.write(
f, encoding="utf-8", xml_declaration=True
)
tree.write(f, encoding="utf-8", xml_declaration=True)

xml_str = f.getvalue().decode()
# prepare json files for kafka bridge
@@ -150,10 +148,10 @@ def decompose_sammelmeldung(root: ET.Element, filename: str):
json.dump(result_data, output_file, indent=4)

save_xml_files(
meldung_root,
patient_id,
meldung_id,
)
meldung_root,
patient_id,
meldung_id,
)

if kafka_producer is not None:
kafka_producer.poll(0)
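
A quick usage note on the remove_leading_zeros change above: stripping is gated by the REMOVE_LEADING_PATIENTID_ZEROS environment variable, as sketched here (the import path is assumed from this commit's file layout):

import os

from decompose_xmls import remove_leading_zeros  # assumed import path

os.environ["REMOVE_LEADING_PATIENTID_ZEROS"] = "true"
assert remove_leading_zeros("000123") == "123"  # leading zeros stripped

os.environ["REMOVE_LEADING_PATIENTID_ZEROS"] = "false"
assert remove_leading_zeros("000123") == "000123"  # returned unchanged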
6 changes: 3 additions & 3 deletions src/decompose_xmls/decompose_xmls_test.py
@@ -10,7 +10,7 @@
"obds_input_file_path", [("input-obds-reports/test-2patients.xml")]
)
def test_decompose_sammelmeldung(snapshot, obds_input_file_path):
os.environ['REMOVE_LEADING_PATIENTID_ZEROS'] = 'false'
os.environ["REMOVE_LEADING_PATIENTID_ZEROS"] = "false"
tree = ET.parse(obds_input_file_path)
root = tree.getroot()

@@ -22,7 +22,7 @@ def test_decompose_sammelmeldung(snapshot, obds_input_file_path):
"obds_input_file_path", [("input-obds-reports/test-patientid-with-zeros.xml")]
)
def test_decompose_with_pathient_id_starting_with_zero(snapshot, obds_input_file_path):
os.environ['REMOVE_LEADING_PATIENTID_ZEROS'] = 'true'
os.environ["REMOVE_LEADING_PATIENTID_ZEROS"] = "true"
tree = ET.parse(obds_input_file_path)
root = tree.getroot()

@@ -34,7 +34,7 @@ def test_decompose_keep_pathient_id_starting_with_zero(snapshot, obds_input_file
"obds_input_file_path", [("input-obds-reports/test-patientid-with-zeros.xml")]
)
def test_decompose_keep_pathient_id_starting_with_zero(snapshot, obds_input_file_path):
os.environ['REMOVE_LEADING_PATIENTID_ZEROS'] = 'false'
os.environ["REMOVE_LEADING_PATIENTID_ZEROS"] = "false"

tree = ET.parse(obds_input_file_path)
root = tree.getroot()
134 changes: 134 additions & 0 deletions src/obds_fhir_to_opal/DataSHIELD_R-Script/entities_gender.R
@@ -0,0 +1,134 @@
install.packages("DSI")
install.packages("DSOpal", dependencies = TRUE)
install.packages(
"dsBaseClient",
repos = c(getOption("repos"),
"http://cran.datashield.org"),
dependencies = TRUE
)
install.packages("languageserver")

library(dsBaseClient)
library(DSOpal)
library(DSI)
library(sys)

install.packages("formatR", repos = "http://cran.rstudio.com")
library(formatR)


# load helper functions from utils_entities_gender.R
source("utils_entities_gender.R")

# Login
result <- ds_login()
logindata <- result$logindata
connections <- result$connections


#### investigate size of df, colnames of df
ds.dim("D")
ds.colnames("D")


#### check gender_mapped values coded as "other"
#### create subsets - 1 = female, 2 = male, 3 = other/diverse

for (loc in seq_along(connections)) {
tryCatch(
ds.dataFrameSubset(
"D",
"D$gender_mapped",
"3",
"==",
newobj = "D1",
datasources = connections[loc]
),
error = function(cond) {
warning(
"subset gendermapped=
other/diverse could not be generated in ",
names(connections[loc]),
". Not enough cases?"
)
}
)
}
ds.dim("D1", datasources = connections[1])


#### optional: additionally filter D on date_diagnosis_year == 2022 to ensure
#### only data from 2022 is included
ds.dataFrameSubset(
df.name = "D",
V1.name = "D$date_diagnosis_year",
V2.name = "2022",
Boolean.operator = "==",
newobj = "D",
datasources = connections
)
ds.dim("D")


#### STEP 1: generate subsets for relevant diagnoses
# generate_subsets: detailed description in utils_entities_gender.R
colname <- "icd10_grouped_entities"
start_subset <- 0
end_subset <- 23

generate_subsets(colname, start_subset, end_subset, connections)

# useful: list all Dataframes on Server side - check which subsets you created
all_loc_dfs <- get_all_loc_dfs()
all_loc_dfs

#### STEP 2: create a list of all relevant subset names
icd_group_list <-
paste0("subset_", colname, "_", start_subset:end_subset)
icd_group_list

#### STEP 3:
# subset_prevalence_by_gender: detailed description in utils_entities_gender.R
subset_by_gender_result <-
subset_prevalence_by_gender(icd_group_list, all_loc_dfs, connections)

#### change row names
new_rownames_eng <- c(
"Lip, Oral Cavity, Pharynx (C00-C14)",
"Oesophagus (C15)",
"Stomach (C16)",
"Colon and Rectum (C18-C21)",
"Liver (C22)",
"Gallbladder and Biliary Tract (C23-C24)",
"Pancreas (C25)",
"Larynx (C32)",
"Trachea, Bronchus and Lungs (C33-C34)",
"Malignant Melanoma of Skin (C43)",
"Breast (C50, D05)",
"Cervis Uteri (C53, D06)",
"Corpus Uteri (C54-C55)",
"Ovary (C56, D39.1)",
"Prostate (C61)",
"Testis (C62)",
"Kidney (C64)",
"Bladder (C67, D09.0, D41.4)",
"Brain and Central Nervous System (C70-C72)",
"Thyroid (C73)",
"Hodgkin Lymphoma (C81)",
"Non-Hodgkin Lymphoma (C82-C88, C96)",
"Plasmacytoma (C90)",
"Leukaemia (C91-C95)"
)

rownames(subset_by_gender_result) <- new_rownames_eng
subset_by_gender_result


# write to csv file
write.csv(subset_by_gender_result,
file = "subset_by_gender_result_matrix.csv",
row.names = TRUE)

# clear DataSHIELD R-Session and logout
datashield.logout(connections)