refactor: obds-fhir-to-opal with focus on first POC FA for publication (#198)

* refactor obds-fhir-to-opal with focus on first POC FA for publication

* megalinter fixes

* test utils

* megalinter happy

* megalinter please be happy

* adds readme

* changed volume location in container to match the one in source code

* add plotting nbs

* use match-case pattern for map_gender (see sketch below)

* addressed megalinter complaints

* formatted decompose_xml

* fix bandit

* fix iteration in R with seq_along

---------

Co-authored-by: chgl <[email protected]>
jasminziegler and chgl authored Aug 19, 2024
1 parent 4c26c98 commit dae745b
Showing 21 changed files with 1,550 additions and 628 deletions.
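
For context on the "use match-case pattern for map_gender" item above, here is a minimal sketch of such a mapping. The numeric codes are taken from the subset comments in entities_gender.R further down (1 = female, 2 = male, 3 = other/diverse); the exact signature and the fallback for unknown values in src/obds_fhir_to_opal are assumptions.

def map_gender(gender: str | None) -> int:
    # codes per the comments in entities_gender.R: 1 = female, 2 = male,
    # 3 = other/diverse; returning 0 for unknown/missing is an assumption
    match gender:
        case "female":
            return 1
        case "male":
            return 2
        case "other" | "diverse":
            return 3
        case _:
            return 0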
4 changes: 2 additions & 2 deletions .bandit.yml
@@ -1,5 +1,5 @@
skips:
- "B405"
- "B314"
exclude_dirs:
- "*_test.py"
assert_used:
skips: ["**/test_*.py", "**/*_test.py"]
2 changes: 2 additions & 0 deletions .gitignore
@@ -17,3 +17,5 @@ bundles-delta/
*.csv

.vscode

.venv/
1 change: 1 addition & 0 deletions .mega-linter.yml
@@ -11,6 +11,7 @@ DISABLE_LINTERS:
# due to import-error
- PYTHON_PYLINT
- SPELL_LYCHEE
- R_LINTR

SHOW_ELAPSED_TIME: true

2 changes: 1 addition & 1 deletion docker-compose/compose.obds-fhir-to-opal.yaml
@@ -19,4 +19,4 @@ services:
KAFKA_PROCEDURE_TOPIC: "fhir.obds.Procedure"
KAFKA_MEDICATIONSTATEMENT_TOPIC: "fhir.obds.MedicationStatement"
volumes:
- ${PWD}/opal-output:/opt/bitnami/spark/opal-output
- ${PWD}/opal-output:/home/spark/opal-output
20 changes: 9 additions & 11 deletions src/decompose_xmls/decompose_xmls.py
@@ -1,10 +1,10 @@
import json
import os
import time
import regex
import xml.etree.ElementTree as ET
from io import BytesIO

import regex
from confluent_kafka import Producer
from pydantic import BaseSettings

@@ -48,8 +48,8 @@ def kafka_delivery_report(err, msg):


def remove_leading_zeros(patient_id: str) -> str:
if os.environ.get('REMOVE_LEADING_PATIENTID_ZEROS') == 'true':
return regex.sub(r'^0+', '', patient_id)
if os.environ.get("REMOVE_LEADING_PATIENTID_ZEROS") == "true":
return regex.sub(r"^0+", "", patient_id)
else:
return patient_id

@@ -86,7 +86,7 @@ def decompose_sammelmeldung(root: ET.Element, filename: str):
continue

# remove all Menge_Meldung
menge_meldung = patient.find('./{http://www.gekid.de/namespace}Menge_Meldung')
menge_meldung = patient.find("./{http://www.gekid.de/namespace}Menge_Meldung")
if menge_meldung is not None:
patient.remove(menge_meldung)

@@ -122,9 +122,7 @@ def decompose_sammelmeldung(root: ET.Element, filename: str):
f = BytesIO()
tree = ET.ElementTree(meldung_root)
ET.indent(tree, " ")
tree.write(
f, encoding="utf-8", xml_declaration=True
)
tree.write(f, encoding="utf-8", xml_declaration=True)

xml_str = f.getvalue().decode()
# prepare json files for kafka bridge
@@ -150,10 +148,10 @@ def decompose_sammelmeldung(root: ET.Element, filename: str):
json.dump(result_data, output_file, indent=4)

save_xml_files(
meldung_root,
patient_id,
meldung_id,
)
meldung_root,
patient_id,
meldung_id,
)

if kafka_producer is not None:
kafka_producer.poll(0)
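
A quick usage note on the remove_leading_zeros change above: stripping is gated by the REMOVE_LEADING_PATIENTID_ZEROS environment variable, as sketched here (the import path is assumed from this commit's file layout):

import os

from decompose_xmls import remove_leading_zeros  # assumed import path

os.environ["REMOVE_LEADING_PATIENTID_ZEROS"] = "true"
assert remove_leading_zeros("000123") == "123"  # leading zeros stripped

os.environ["REMOVE_LEADING_PATIENTID_ZEROS"] = "false"
assert remove_leading_zeros("000123") == "000123"  # returned unchanged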
6 changes: 3 additions & 3 deletions src/decompose_xmls/decompose_xmls_test.py
@@ -10,7 +10,7 @@
"obds_input_file_path", [("input-obds-reports/test-2patients.xml")]
)
def test_decompose_sammelmeldung(snapshot, obds_input_file_path):
os.environ['REMOVE_LEADING_PATIENTID_ZEROS'] = 'false'
os.environ["REMOVE_LEADING_PATIENTID_ZEROS"] = "false"
tree = ET.parse(obds_input_file_path)
root = tree.getroot()

@@ -22,7 +22,7 @@ def test_decompose_sammelmeldung(snapshot, obds_input_file_path):
"obds_input_file_path", [("input-obds-reports/test-patientid-with-zeros.xml")]
)
def test_decompose_with_pathient_id_starting_with_zero(snapshot, obds_input_file_path):
os.environ['REMOVE_LEADING_PATIENTID_ZEROS'] = 'true'
os.environ["REMOVE_LEADING_PATIENTID_ZEROS"] = "true"
tree = ET.parse(obds_input_file_path)
root = tree.getroot()

@@ -34,7 +34,7 @@ def test_decompose_keep_pathient_id_starting_with_zero(snapshot, obds_input_file
"obds_input_file_path", [("input-obds-reports/test-patientid-with-zeros.xml")]
)
def test_decompose_keep_pathient_id_starting_with_zero(snapshot, obds_input_file_path):
os.environ['REMOVE_LEADING_PATIENTID_ZEROS'] = 'false'
os.environ["REMOVE_LEADING_PATIENTID_ZEROS"] = "false"

tree = ET.parse(obds_input_file_path)
root = tree.getroot()
134 changes: 134 additions & 0 deletions src/obds_fhir_to_opal/DataSHIELD_R-Script/entities_gender.R
@@ -0,0 +1,134 @@
install.packages("DSI")
install.packages("DSOpal", dependencies = TRUE)
install.packages(
"dsBaseClient",
repos = c(getOption("repos"),
"http://cran.datashield.org"),
dependencies = TRUE
)
install.packages("languageserver")

library(dsBaseClient)
library(DSOpal)
library(DSI)
library(sys)

install.packages("formatR", repos = "http://cran.rstudio.com")
library(formatR)


# load helper functions from utils_entities_gender.R
source("utils_entities_gender.R")

# Login
result <- ds_login()
logindata <- result$logindata
connections <- result$connections


#### investigate size of df, colnames of df
ds.dim("D")
ds.colnames("D")


#### check gender_mapped values coded as "other"
#### create subsets - 1 = female, 2 = male, 3 = other/diverse

for (loc in seq_along(connections)) {
tryCatch(
ds.dataFrameSubset(
"D",
"D$gender_mapped",
"3",
"==",
newobj = "D1",
datasources = connections[loc]
),
error = function(cond) {
warning(
"subset gendermapped=
other/diverse could not be generated in ",
names(connections[loc]),
". Not enough cases?"
)
}
)
}
ds.dim("D1", datasources = connections[1])


#### optional: additionally filter D on date_diagnosis_year == 2022 to ensure
#### only data from 2022 is included
ds.dataFrameSubset(
df.name = "D",
V1.name = "D$date_diagnosis_year",
V2.name = "2022",
Boolean.operator = "==",
newobj = "D",
datasources = connections
)
ds.dim("D")


#### STEP 1: generate subsets for relevant diagnoses
# generate_subsets: detailed description in utils_entities_gender.R
colname <- "icd10_grouped_entities"
start_subset <- 0
end_subset <- 23

generate_subsets(colname, start_subset, end_subset, connections)

# useful: list all Dataframes on Server side - check which subsets you created
all_loc_dfs <- get_all_loc_dfs()
all_loc_dfs

#### STEP 2: create a list of all relevant subset names
icd_group_list <-
paste0("subset_", colname, "_", start_subset:end_subset)
icd_group_list

#### STEP 3:
# subset_prevalence_by_gender: detailed description in utils_entities_gender.R
subset_by_gender_result <-
subset_prevalence_by_gender(icd_group_list, all_loc_dfs, connections)

#### change row names
new_rownames_eng <- c(
"Lip, Oral Cavity, Pharynx (C00-C14)",
"Oesophagus (C15)",
"Stomach (C16)",
"Colon and Rectum (C18-C21)",
"Liver (C22)",
"Gallbladder and Biliary Tract (C23-C24)",
"Pancreas (C25)",
"Larynx (C32)",
"Trachea, Bronchus and Lungs (C33-C34)",
"Malignant Melanoma of Skin (C43)",
"Breast (C50, D05)",
"Cervis Uteri (C53, D06)",
"Corpus Uteri (C54-C55)",
"Ovary (C56, D39.1)",
"Prostate (C61)",
"Testis (C62)",
"Kidney (C64)",
"Bladder (C67, D09.0, D41.4)",
"Brain and Central Nervous System (C70-C72)",
"Thyroid (C73)",
"Hodgkin Lymphoma (C81)",
"Non-Hodgkin Lymphoma (C82-C88, C96)",
"Plasmacytoma (C90)",
"Leukaemia (C91-C95)"
)

rownames(subset_by_gender_result) <- new_rownames_eng
subset_by_gender_result


# write to csv file
write.csv(subset_by_gender_result,
file = "subset_by_gender_result_matrix.csv",
row.names = TRUE)

# clear DataSHIELD R-Session and logout
datashield.logout(connections)