Merge branch 'master' into piotr/harvard

uc-cdis · Jan 30, 2025 · 5a8ce80 · 5a8ce80
2 parents b4f00a7 + 7c7cf79
commit 5a8ce80
Show file tree

Hide file tree

Showing 12 changed files with 333 additions and 25 deletions.
diff --git a/README.md b/README.md
@@ -17,11 +17,13 @@ These notebooks perform optimally within a HEAL Gen3 Workspace and the notebooks
 
 ### VLMD extraction and validation
 
-The [VLMD docs](heal/vlmd/README.md) describe how to use the SDK for extracting and validating VLMD dictionaries.
+
+The [VLMD documentation](heal/vlmd/README.md) describes how to use the SDK for extracting and validating VLMD dictionaries.
+
 
 ### Run tests
 
-```
+```bash
 poetry run pytest -vv tests
 ```
 
@@ -33,19 +35,38 @@ reference the git repo.
 As an example, `pip install` can be called from the command line for getting
 the master branch of the `heal-platform-sdk`,
 
-```
+```bash
 pip install -e git+https://github.com/uc-cdis/heal-platform-sdk.git#egg=heal
 ```
 
 or a particular branch, eg,
 
-```
+```bash
 pip install -e git+https://github.com/uc-cdis/heal-platform-sdk.git@my-branch#egg=heal
 ```
 
 The specification can also be listed in requirements.txt file
 (with, say, a tag specification of 0.1.0)
 
-```
+```bash
 pip install -e git+https://github.com/uc-cdis/heal-platform-sdk.git@0.1.0#egg=heal
 ```
+
+### CLI
+
+The SDK exposes a Command Line Interface (CLI) for some functions.
+
+The CLI can be invoked as follows:
+
+`heal [OPTIONS] COMMAND [ARGS]`
+
+For a list of commands and options run:
+
+`heal --help`
+
+For example, the following can validate a VLMD file in csv format:
+
+`heal vlmd validate --input_file "vlmd_for_validation.csv"`
+
+The [VLMD documentation](heal/vlmd/README.md) provides information on
+using the VLMD functions, such as `extract` and `validate`.
diff --git a/heal/cli/__init__.py b/heal/cli/__init__.py
diff --git a/heal/cli/extract.py b/heal/cli/extract.py
@@ -0,0 +1,38 @@
+import click
+from cdislogging import get_logger
+
+from heal.vlmd.extract.extract import vlmd_extract
+
+# Pass the module's real __name__ (not the literal string "__name__") so log
+# records are attributed to this module.
+logging = get_logger(__name__)
+
+
+@click.command()
+@click.option(
+    "--input_file",
+    "input_file",
+    required=True,
+    help="name of file to extract HEAL-compliant VLMD file",
+    # The input is only read, so do not require it to be writable.
+    type=click.Path(readable=True),
+)
+@click.option(
+    "--output_dir",
+    "output_dir",
+    help="directory to write converted dictionary",
+    default=".",
+    type=click.Path(writable=True),
+    show_default=True,
+)
+def extract(input_file, output_dir):
+    """Extract HEAL-compliant VLMD file from input file"""
+
+    logging.info(f"Extracting VLMD from {input_file}")
+
+    # Failures are reported through the logger rather than propagated, so the
+    # command itself does not raise.
+    try:
+        vlmd_extract(input_file, output_dir=output_dir)
+    except Exception as e:
+        logging.error(f"Extraction error {str(e)}")
diff --git a/heal/cli/heal_cli.py b/heal/cli/heal_cli.py
@@ -0,0 +1,36 @@
+import logging
+
+import cdislogging
+import click
+
+import heal.cli.vlmd as vlmd
+
+
+@click.group()
+@click.option(
+    "--silent",
+    "silent",
+    is_flag=True,
+    default=False,
+    help="don't show ANY logs",
+)
+@click.pass_context
+def main(ctx, silent):
+    """HEAL-Platform SDK Command Line Interface"""
+    # Ensure ctx.obj exists as a dict (it is created if missing).
+    ctx.ensure_object(dict)
+
+    if silent:
+        # we still need to define the logger, the log_level here doesn't
+        # really matter b/c we immediately disable all logging
+        cdislogging.get_logger("heal_cli", log_level="debug")
+        # disables all logging
+        logging.disable(logging.CRITICAL)
+
+
+# Register the `vlmd` command group under this top-level group.
+main.add_command(vlmd.vlmd)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/heal/cli/validate.py b/heal/cli/validate.py
@@ -0,0 +1,32 @@
+import click
+from cdislogging import get_logger
+
+from heal.vlmd.validate.validate import vlmd_validate
+
+# Pass the module's real __name__ (not the literal string "__name__") so log
+# records are attributed to this module.
+logging = get_logger(__name__)
+
+
+@click.command()
+@click.option(
+    "--input_file",
+    "input_file",
+    required=True,
+    help="name of file to validate",
+    # The input is only read, so do not require it to be writable.
+    type=click.Path(readable=True),
+)
+def validate(input_file):
+    """Validate VLMD input file"""
+
+    logging.info(f"Validating VLMD file {input_file}")
+
+    # Any exception from vlmd_validate is logged and the file reported invalid;
+    # nothing is re-raised.
+    try:
+        vlmd_validate(input_file)
+        logging.info("Valid")
+    except Exception as e:
+        logging.error(f"Validation error {str(e)}")
+        logging.error("Invalid file")
diff --git a/heal/cli/vlmd.py b/heal/cli/vlmd.py
@@ -0,0 +1,19 @@
+import click
+
+from heal.cli import extract, validate
+
+
+# Empty group: no commands are registered on it within this module.
+@click.group()
+def main():
+    """HEAL Command Line Interface"""
+
+
+@click.group()
+def vlmd():
+    """Commands for VLMD"""
+
+
+# Attach the extract and validate commands to the `vlmd` group.
+vlmd.add_command(extract.extract)
+vlmd.add_command(validate.validate)
diff --git a/heal/vlmd/README.md b/heal/vlmd/README.md
@@ -1,5 +1,31 @@
 # VLMD methods
 
+## VLMD extract
+
+The extract module implements extraction and conversion of dictionaries into different formats.
+
+The current formats are csv, json, and tsv.
+
+The `vlmd_extract()` method raises a `jsonschema.ValidationError` for an invalid input file and raises
+`ExtractionError` for any other type of error.
+
+Example extraction code:
+
+```python
+from jsonschema import ValidationError
+
+from heal.vlmd import vlmd_extract, ExtractionError
+
+try:
+  vlmd_extract("vlmd_for_extraction.csv", output_dir="./output")
+
+except ValidationError as v_err:
+  # handle validation error
+
+except ExtractionError as e_err:
+  # handle extraction error
+```
+
 ## VLMD validation
 
 This module validates VLMD data dictionaries against stored schemas. The `vlmd_validate()` method
@@ -10,7 +36,7 @@ will raise an `ExtractionError` if the input_file cannot be converted
 
 Example validation code:
 
-```
+```python
 from jsonschema import ValidationError
 
 from heal.vlmd import vlmd_validate, ExtractionError
@@ -38,7 +64,7 @@ and raises an `ExtractionError` for any other type of error.
 
 Example extraction code:
 
-```
+```python
 from jsonschema import ValidationError
 
 from heal.vlmd import vlmd_extract, ExtractionError
@@ -70,3 +96,23 @@ To add code for a new dictionary file type:
 * Call the new extractor module from the `conversion.py` module
 * Add new file writing utilities if saving converted dictionaries in the new format
 * Create unit tests as needed for new code
+
+
+## CLI
+
+The CLI can be invoked as follows:
+
+`heal [OPTIONS] COMMAND [ARGS]`
+
+For a list of VLMD commands and options run:
+
+`heal vlmd --help`
+
+For example, the following can validate a VLMD file in csv format:
+
+`heal vlmd validate --input_file "vlmd_for_validation.csv"`
+
+The following would extract a json format VLMD file from a csv format input file and
+write a json file in the directory `output`:
+
+`heal vlmd extract --input_file "vlmd_for_extraction.csv" --output_dir "./output"`
diff --git a/heal/vlmd/extract/extract.py b/heal/vlmd/extract/extract.py
@@ -42,19 +42,25 @@ def vlmd_extract(
 
     file_suffix = Path(input_file).suffix.replace(".", "")
     if file_suffix not in ALLOWED_INPUT_TYPES:
-        raise ExtractionError(f"Input file must be one of {ALLOWED_INPUT_TYPES}")
+        message = f"Input file must be one of {ALLOWED_INPUT_TYPES}"
+        logger.error(message)
+        raise ExtractionError(message)
     if not isfile(input_file):
-        raise ExtractionError(f"Input file does not exist: {input_file}")
+        message = f"Input file does not exist: {input_file}"
+        logger.error(message)
+        raise ExtractionError(message)
 
     if file_type not in ALLOWED_FILE_TYPES:
-        raise ExtractionError(f"File type must be one of {ALLOWED_FILE_TYPES}")
+        message = f"File type must be one of {ALLOWED_FILE_TYPES}"
+        logger.error(message)
+        raise ExtractionError(message)
     if file_type == "auto":
         file_type = file_suffix
 
     if output_type not in ALLOWED_OUTPUT_TYPES:
-        raise ExtractionError(
-            f"Unrecognized output_type '{output_type}' - should be in {ALLOWED_OUTPUT_TYPES}"
-        )
+        message = f"Unrecognized output_type '{output_type}' - should be in {ALLOWED_OUTPUT_TYPES}"
+        logger.error(message)
+        raise ExtractionError(message)
 
     # validate
     try:

diff --git a/heal/vlmd/validate/validate.py b/heal/vlmd/validate/validate.py
@@ -62,21 +62,29 @@ def vlmd_validate(
     )
     file_suffix = Path(input_file).suffix.replace(".", "")
     if file_suffix not in ALLOWED_INPUT_TYPES:
-        raise ValueError(f"Input file must be one of {ALLOWED_INPUT_TYPES}")
+        message = f"Input file must be one of {ALLOWED_INPUT_TYPES}"
+        logger.error(message)
+        raise ValueError(message)
     if not isfile(input_file):
-        raise IOError(f"Input file does not exist: {input_file}")
+        message = f"Input file does not exist: {input_file}"
+        logger.error(message)
+        raise IOError(message)
 
     if schema_type not in ALLOWED_SCHEMA_TYPES:
-        raise ValueError(f"Schema type must be in {ALLOWED_SCHEMA_TYPES}")
+        message = f"Schema type must be in {ALLOWED_SCHEMA_TYPES}"
+        logger.error(message)
+        raise ValueError(message)
     schema = get_schema(input_file, schema_type)
     if schema is None:
-        raise ValueError(f"Could not get schema for type = {schema_type}")
+        message = f"Could not get schema for type = {schema_type}"
+        logger.error(message)
+        raise ValueError(message)
 
     output_type = output_type if output_type else "json"
     if output_type not in ALLOWED_OUTPUT_TYPES:
-        raise ValueError(
-            f"Unrecognized output_type '{output_type}' - should be in {ALLOWED_OUTPUT_TYPES}"
-        )
+        message = f"Unrecognized output_type '{output_type}' - should be in {ALLOWED_OUTPUT_TYPES}"
+        logger.error(message)
+        raise ValueError(message)
 
     # TODO: We need this for csv - see if we can add this to get_schema
     if file_suffix in ["csv", "tsv"]:
@@ -89,7 +97,9 @@ def vlmd_validate(
         logger.debug("Getting csv data from file")
         data = read_delim(input_file).to_dict(orient="records")
         if len(data) == 0:
-            raise ValidationError("Could not read csv data from input")
+            message = "Could not read csv data from input"
+            logger.error(message)
+            raise ValidationError(message)
     elif file_suffix == "json":
         logger.debug("Getting json data from file")
         data = read_data_from_json_file(input_file)
@@ -106,9 +116,9 @@ def vlmd_validate(
     # convert
     input_type = file_type_to_fxn_map.get(file_suffix)
     if not input_type:
-        raise ExtractionError(
-            f"Could not get conversion function from file_suffix '{file_suffix}'"
-        )
+        message = f"Could not get conversion function from file_suffix '{file_suffix}'"
+        logger.error(message)
+        raise ExtractionError(message)
     data_dictionaries = {}
     logger.debug(f"Verifying vlmd can be converted using input_type '{input_type}'")
     data_dictionary_props = {}
@@ -139,7 +149,9 @@ def vlmd_validate(
             # TODO: see if we can add this to get_schema
             schema = add_types_to_props(schema)
         if schema is None:
-            raise ValueError(f"Could not get schema for type = {schema_type}")
+            message = f"Could not get schema for type = {schema_type}"
+            logger.error(message)
+            raise ValueError(message)
 
     try:
         jsonschema.validate(instance=converted_dictionary, schema=schema)

diff --git a/pyproject.toml b/pyproject.toml
@@ -19,6 +19,9 @@ pytest = "^7.0.0"
 pytest-cov = "*"
 requests-mock = "*"
 
+[tool.poetry.scripts]
+heal = "heal.cli.heal_cli:main"
+
 [build-system]
 requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -1,4 +1,5 @@
 import json
+
 import pytest
 
 from heal.vlmd.config import (