uc-cdis · george42-ctds · Oct 24, 2024 · Oct 29, 2024 · Oct 29, 2024 · Oct 29, 2024
diff --git a/README.md b/README.md
@@ -15,9 +15,9 @@ In the notebooks directory there are jupyter notebooks that may be used to downl
 
 These notebooks perform optimally within a HEAL Gen3 Workspace and the notebooks will be automatically installed to a user's workspace when the workspace is initiated. However, you may also use these notebooks on your local machine.
 
-### VLMD validation
+### VLMD extraction and validation
 
-The [VLMD validation docs](heal/vlmd/README.md) describe how to use the SDK for validating VLMD dictionaries.
+The [VLMD docs](heal/vlmd/README.md) describe how to use the SDK for extracting and validating VLMD dictionaries.
 
 ### Run tests
 

diff --git a/heal/vlmd/README.md b/heal/vlmd/README.md
@@ -1,5 +1,31 @@
 # VLMD methods
 
+## VLMD extract
+
+The extract module implements extraction and conversion of dictionaries into different formats.
+
+The current formats are csv, json, and tsv.
+
+The `vlmd_extract()` method raises a `jsonschema.ValidationError` for an invalid input file and raises
+`ExtractionError` for any other type of error.
+
+Example extraction code:
+
+```
+from jsonschema import ValidationError
+
+from heal.vlmd import vlmd_extract
+
+try:
+  vlmd_extract("vlmd_for_extraction.csv", output_dir="./output")
+
+except ValidationError as v_err:
+  # handle validation error
+
+except ExtractionError as e_err:
+  # handle extraction error
+```
+
 ## VLMD validation
 
 This module validates VLMD data dictionaries against stored schemas.
@@ -21,13 +47,16 @@ except ValidationError as e:
 
 ```
 
-### Adding new validators
+## Adding new file types for extraction and validation
 
-The module currently validates the following types of dictionaries: csv, json, tsv.
+The above modules currently handle the following types of dictionaries: csv, json, tsv.
 
 To add code for a new dictionary file type:
 
 * Create a new schema for the data type or validate against the existing json schema
 * Create a new validator module for the new file type
-* Call the new module from the `validator.py` module
+* Call the new validator module from the `validator.py` module
+* Create a new extractor module for the new file type
+* Call the new extractor module from the `conversion.py` module
+* Add new file writing utilities if needed
 * Create unit tests as needed for new code
diff --git a/heal/vlmd/__init__.py b/heal/vlmd/__init__.py
@@ -1 +1,4 @@
 from heal.vlmd.validate.validate import vlmd_validate
+
+# place 'extract' import after 'validate' import
+from heal.vlmd.extract.extract import vlmd_extract
diff --git a/heal/vlmd/config.py b/heal/vlmd/config.py
@@ -1,12 +1,24 @@
 import json
 
+# file prefix
+OUTPUT_FILE_PREFIX = "heal-dd"
+
+# file suffixes
 ALLOWED_INPUT_TYPES = ["csv", "tsv", "json"]
+ALLOWED_FILE_TYPES = ["auto"] + ALLOWED_INPUT_TYPES
 ALLOWED_SCHEMA_TYPES = ["auto", "csv", "json", "tsv"]
 
+# schemas
 csv_schema_file = "heal/vlmd/schemas/heal_csv.json"
 with open(csv_schema_file, "r") as f:
     CSV_SCHEMA = json.load(f)
 
 json_schema_file = "heal/vlmd/schemas/heal_json.json"
 with open(json_schema_file, "r") as f:
     JSON_SCHEMA = json.load(f)
+
+# schema
+TOP_LEVEL_PROPS = {
+    "schemaVersion": JSON_SCHEMA.get("version", "0.3.2"),
+    "title": "HEAL Data Dictionary",
+}
diff --git a/heal/vlmd/extract/conversion.py b/heal/vlmd/extract/conversion.py
@@ -0,0 +1,92 @@
+from functools import partial
+from pathlib import Path
+
+from cdislogging import get_logger
+from heal.vlmd.config import JSON_SCHEMA, TOP_LEVEL_PROPS
+from heal.vlmd import mappings
+from heal.vlmd.extract.csv_dict_conversion import convert_datadictcsv
+from heal.vlmd.extract.json_dict_conversion import convert_templatejson
+from heal.vlmd.utils import remove_empty_props
+
+
+logger = get_logger("vlmd-conversion", log_level="debug")
+
+choice_fxn = {
+    "csv-data-dict": partial(
+        convert_datadictcsv, renamemap=mappings.renamemap, recodemap=mappings.recodemap
+    ),
+    "json-template": convert_templatejson,
+}
+
+ext_map = {
+    ".csv": "csv-data-dict",
+    ".json": "json-template",
+}
+
+
+def _detect_input_type(filepath, ext_to_input_type=ext_map):
+    ext = filepath.suffix
+    input_type = ext_to_input_type.get(ext, None)
+    return input_type
+
+
+def convert_to_vlmd(
+    input_filepath,
+    input_type=None,
+    data_dictionary_props=None,
+):
+    """
+    Converts a data dictionary to HEAL compliant json or csv format.
+
+    Args
+        input_filepath (str): Path to input file. Currently converts data dictionaries in csv, json, and tsv.
+        input_type (str): The input type. See keys of 'choice_fxn' dict for options, currently:
+            csv-data-dict, json-template.
+        data_dictionary_props (dict):
+            The other data-dictionary level properties. By default, will give the data_dictionary `title` property as the file name stem.
+
+    Returns
+        Dictionary with:
+         1. csvtemplated array of fields.
+         2. jsontemplated data dictionary object as specified by an originally drafted design doc.
+            That is, a dictionary with title:<title>,description:<description>,data_dictionary:<fields>
+            where data dictionary is an array of fields as specified by the JSON schema.
+
+    """
+
+    input_filepath = Path(input_filepath)
+
+    input_type = input_type or _detect_input_type(input_filepath)
+    logger.debug(f"Converting file '{input_filepath}' of input_type '{input_type}'")
+    if input_type not in choice_fxn.keys():
+        logger.error(f"Unexpected input type {input_type}")
+        raise ValueError(
+            f"Unexpected input_type '{input_type}', not in {choice_fxn.keys()}"
+        )
+
+    # get data dictionary package based on the input type
+    data_dictionary_props = data_dictionary_props or {}
+    data_dictionary_package = choice_fxn[input_type](
+        input_filepath, data_dictionary_props
+    )
+    logger.debug(f"Data Dictionary Package keys {data_dictionary_package.keys()}")
+
+    # For now we return the csv and json in one package.
+    # If any multiple data dictionaries are needed then implement the methods in
+    # https://github.com/HEAL/healdata-utils/blob/5080227454d8e731d46a51aa6933c93523eb3b9a/src/healdata_utils/conversion.py#L196
+    package = data_dictionary_package
+
+    # add schema version
+    for field in package["templatecsv"]["fields"]:
+        field.update({"schemaVersion": JSON_SCHEMA["version"], **field})
+
+    # remove empty json fields add schema version (in TOP_LEVEL_PROPS)
+    cleaned_fields = []
+    for field in package["templatejson"]["fields"]:
+        new_field = remove_empty_props(field)
+        cleaned_fields.append(new_field)
+    package["templatejson"]["fields"] = cleaned_fields
+
+    package["templatejson"] = {**TOP_LEVEL_PROPS, **dict(package["templatejson"])}
+
+    return package