Skip to content

Commit

Permalink
Merge branch 'master' into piotr/harvard
Browse files Browse the repository at this point in the history
  • Loading branch information
piotrsenkow authored Jan 30, 2025
2 parents b4f00a7 + 7c7cf79 commit 5a8ce80
Show file tree
Hide file tree
Showing 12 changed files with 333 additions and 25 deletions.
31 changes: 26 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,13 @@ These notebooks perform optimally within a HEAL Gen3 Workspace and the notebooks

### VLMD extraction and validation

The [VLMD docs](heal/vlmd/README.md) describe how to use the SDK for extracting and validating VLMD dictionaries.

The [VLMD documentation](heal/vlmd/README.md) describes how to use the SDK for extracting and validating VLMD dictionaries.


### Run tests

```
```bash
poetry run pytest -vv tests
```

Expand All @@ -33,19 +35,38 @@ reference the git repo.
As an example, `pip install` can be called from the command line for getting
the master branch of the `heal-platform-sdk`,

```
```bash
pip install -e git+https://github.com/uc-cdis/heal-platform-sdk.git#egg=heal
```

or a particular branch, e.g.,

```
```bash
pip install -e git+https://github.com/uc-cdis/heal-platform-sdk.git@my-branch#egg=heal
```

The specification can also be listed in a `requirements.txt` file
(with, say, a tag specification of 0.1.0)

```
```bash
pip install -e git+https://github.com/uc-cdis/heal-platform-sdk.git@0.1.0#egg=heal
```

### CLI

The SDK exposes a Command Line Interface (CLI) for some functions.

The CLI can be invoked as follows:

`heal [OPTIONS] COMMAND [ARGS]`

For a list of commands and options, run:

`heal --help`

For example, the following can validate a VLMD file in csv format:

`heal vlmd validate --input_file "vlmd_for_validation.csv"`

The [VLMD documentation](heal/vlmd/README.md) provides information on
using the VLMD functions, such as `extract` and `validate`.
Empty file added heal/cli/__init__.py
Empty file.
33 changes: 33 additions & 0 deletions heal/cli/extract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import click
from cdislogging import get_logger

from heal.vlmd.extract.extract import vlmd_extract

# Name the logger after this module. The previous quoted "__name__" was a
# literal string, so every module using it shared one logger called "__name__".
logging = get_logger(__name__)


# Click command behind `heal vlmd extract`: a thin wrapper around
# `vlmd_extract`. The function docstring is what click prints for `--help`,
# so implementation notes live in comments rather than in the docstring.
@click.command()
@click.option(
    "--input_file",
    "input_file",
    required=True,
    help="name of file to extract HEAL-compliant VLMD file",
    # The input file is only read, so require read access; demanding write
    # access rejected perfectly usable read-only inputs.
    type=click.Path(readable=True),
)
@click.option(
    "--output_dir",
    "output_dir",
    help="directory to write converted dictionary",
    default=".",
    type=click.Path(writable=True),
    show_default=True,
)
def extract(input_file: str, output_dir: str) -> None:
    """Extract HEAL-compliant VLMD file from input file"""

    logging.info(f"Extracting VLMD from {input_file}")

    try:
        vlmd_extract(input_file, output_dir=output_dir)
    except Exception as e:
        # CLI boundary: report any failure through the logger rather than
        # letting a traceback reach the user.
        # NOTE(review): the error is only logged, so the command still
        # returns normally (exit status 0) on failure - confirm this is
        # intended for scripted use.
        logging.error(f"Extraction error {str(e)}")
32 changes: 32 additions & 0 deletions heal/cli/heal_cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import logging

import cdislogging
import click

import heal.cli.vlmd as vlmd


# Root click group for the `heal` command. The docstring below is the text
# click shows for `heal --help`.
@click.group()
@click.option(
    "--silent",
    "silent",
    is_flag=True,
    default=False,
    help="don't show ANY logs",
)
@click.pass_context
def main(ctx, silent):
    """HEAL-Platform SDK Command Line Interface"""
    # Make sure ctx.obj is a dict before any subcommand runs.
    ctx.ensure_object(dict)

    if not silent:
        return

    # The logger still has to be created; its level is irrelevant because
    # all logging is switched off immediately afterwards.
    cdislogging.get_logger("heal_cli", log_level="debug")
    logging.disable(logging.CRITICAL)


# Attach the `vlmd` command group to the root group.
main.add_command(vlmd.vlmd)


if __name__ == "__main__":
    main()
27 changes: 27 additions & 0 deletions heal/cli/validate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import click
from cdislogging import get_logger

from heal.vlmd.validate.validate import vlmd_validate

# Name the logger after this module. The previous quoted "__name__" was a
# literal string, so every module using it shared one logger called "__name__".
logging = get_logger(__name__)


# Click command behind `heal vlmd validate`: a thin wrapper around
# `vlmd_validate`. The function docstring is what click prints for `--help`,
# so implementation notes live in comments rather than in the docstring.
@click.command()
@click.option(
    "--input_file",
    "input_file",
    required=True,
    help="name of file to validate",
    # Validation only reads the file, so require read access; demanding
    # write access rejected perfectly usable read-only inputs.
    type=click.Path(readable=True),
)
def validate(input_file: str) -> None:
    """Validate VLMD input file"""

    # A space now separates the label from the file name in the log line.
    logging.info(f"Validating VLMD file {input_file}")

    try:
        vlmd_validate(input_file)
        logging.info("Valid")
    except Exception as e:
        # CLI boundary: report any failure through the logger rather than
        # letting a traceback reach the user.
        # NOTE(review): the error is only logged, so the command still
        # returns normally (exit status 0) for an invalid file - confirm
        # this is intended for scripted use.
        logging.error(f"Validation error {str(e)}")
        logging.error("Invalid file")
19 changes: 19 additions & 0 deletions heal/cli/vlmd.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import click

from heal.cli import extract, validate


# NOTE(review): nothing in this module attaches commands to this group or
# invokes it - presumably superseded by the root group in heal_cli; confirm
# before removing.
@click.group()
def main():
    """HEAL Command Line Interface"""


# Command group holding the VLMD subcommands. The docstring is the text
# click shows for `heal vlmd --help`.
@click.group()
def vlmd():
    """Commands for VLMD"""


# Register the subcommands: `extract` first, then `validate`.
vlmd.add_command(extract.extract)
vlmd.add_command(validate.validate)
50 changes: 48 additions & 2 deletions heal/vlmd/README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,31 @@
# VLMD methods

## VLMD extract

The extract module implements extraction and conversion of dictionaries into different formats.

The current formats are csv, json, and tsv.

The `vlmd_extract()` method raises a `jsonschema.ValidationError` for an invalid input file and raises
an `ExtractionError` for any other type of error.

Example extraction code:

```python
from jsonschema import ValidationError

from heal.vlmd import vlmd_extract, ExtractionError

try:
    vlmd_extract("vlmd_for_extraction.csv", output_dir="./output")

except ValidationError as v_err:
    # handle validation error
    ...

except ExtractionError as e_err:
    # handle extraction error
    ...
```

## VLMD validation

This module validates VLMD data dictionaries against stored schemas. The `vlmd_validate()` method
Expand All @@ -10,7 +36,7 @@ will raise an `ExtractionError` if the input_file cannot be converted

Example validation code:

```
```python
from jsonschema import ValidationError

from heal.vlmd import vlmd_validate, ExtractionError
Expand Down Expand Up @@ -38,7 +64,7 @@ and raises an `ExtractionError` for any other type of error.

Example extraction code:

```
```python
from jsonschema import ValidationError

from heal.vlmd import vlmd_extract, ExtractionError
Expand Down Expand Up @@ -70,3 +96,23 @@ To add code for a new dictionary file type:
* Call the new extractor module from the `conversion.py` module
* Add new file writing utilities if saving converted dictionaries in the new format
* Create unit tests as needed for new code


## CLI

The CLI can be invoked as follows:

`heal [OPTIONS] COMMAND [ARGS]`

For a list of VLMD commands and options, run:

`heal vlmd --help`

For example, the following can validate a VLMD file in csv format:

`heal vlmd validate --input_file "vlmd_for_validation.csv"`

The following would extract a json format VLMD file from a csv format input file and
write a json file in the directory `output`:

`heal vlmd extract --input_file "vlmd_for_extraction.csv" --output_dir "./output"`
18 changes: 12 additions & 6 deletions heal/vlmd/extract/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,19 +42,25 @@ def vlmd_extract(

file_suffix = Path(input_file).suffix.replace(".", "")
if file_suffix not in ALLOWED_INPUT_TYPES:
raise ExtractionError(f"Input file must be one of {ALLOWED_INPUT_TYPES}")
message = f"Input file must be one of {ALLOWED_INPUT_TYPES}"
logger.error(message)
raise ExtractionError(message)
if not isfile(input_file):
raise ExtractionError(f"Input file does not exist: {input_file}")
message = f"Input file does not exist: {input_file}"
logger.error(message)
raise ExtractionError(message)

if file_type not in ALLOWED_FILE_TYPES:
raise ExtractionError(f"File type must be one of {ALLOWED_FILE_TYPES}")
message = f"File type must be one of {ALLOWED_FILE_TYPES}"
logger.error(message)
raise ExtractionError(message)
if file_type == "auto":
file_type = file_suffix

if output_type not in ALLOWED_OUTPUT_TYPES:
raise ExtractionError(
f"Unrecognized output_type '{output_type}' - should be in {ALLOWED_OUTPUT_TYPES}"
)
message = f"Unrecognized output_type '{output_type}' - should be in {ALLOWED_OUTPUT_TYPES}"
logger.error(message)
raise ExtractionError(message)

# validate
try:
Expand Down
36 changes: 24 additions & 12 deletions heal/vlmd/validate/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,21 +62,29 @@ def vlmd_validate(
)
file_suffix = Path(input_file).suffix.replace(".", "")
if file_suffix not in ALLOWED_INPUT_TYPES:
raise ValueError(f"Input file must be one of {ALLOWED_INPUT_TYPES}")
message = f"Input file must be one of {ALLOWED_INPUT_TYPES}"
logger.error(message)
raise ValueError(message)
if not isfile(input_file):
raise IOError(f"Input file does not exist: {input_file}")
message = f"Input file does not exist: {input_file}"
logger.error(message)
raise IOError(message)

if schema_type not in ALLOWED_SCHEMA_TYPES:
raise ValueError(f"Schema type must be in {ALLOWED_SCHEMA_TYPES}")
message = f"Schema type must be in {ALLOWED_SCHEMA_TYPES}"
logger.error(message)
raise ValueError(message)
schema = get_schema(input_file, schema_type)
if schema is None:
raise ValueError(f"Could not get schema for type = {schema_type}")
message = f"Could not get schema for type = {schema_type}"
logger.error(message)
raise ValueError(message)

output_type = output_type if output_type else "json"
if output_type not in ALLOWED_OUTPUT_TYPES:
raise ValueError(
f"Unrecognized output_type '{output_type}' - should be in {ALLOWED_OUTPUT_TYPES}"
)
message = f"Unrecognized output_type '{output_type}' - should be in {ALLOWED_OUTPUT_TYPES}"
logger.error(message)
raise ValueError(message)

# TODO: We need this for csv - see if we can add this to get_schema
if file_suffix in ["csv", "tsv"]:
Expand All @@ -89,7 +97,9 @@ def vlmd_validate(
logger.debug("Getting csv data from file")
data = read_delim(input_file).to_dict(orient="records")
if len(data) == 0:
raise ValidationError("Could not read csv data from input")
message = "Could not read csv data from input"
logger.error(message)
raise ValidationError(message)
elif file_suffix == "json":
logger.debug("Getting json data from file")
data = read_data_from_json_file(input_file)
Expand All @@ -106,9 +116,9 @@ def vlmd_validate(
# convert
input_type = file_type_to_fxn_map.get(file_suffix)
if not input_type:
raise ExtractionError(
f"Could not get conversion function from file_suffix '{file_suffix}'"
)
message = f"Could not get conversion function from file_suffix '{file_suffix}'"
logger.error(message)
raise ExtractionError(message)
data_dictionaries = {}
logger.debug(f"Verifying vlmd can be converted using input_type '{input_type}'")
data_dictionary_props = {}
Expand Down Expand Up @@ -139,7 +149,9 @@ def vlmd_validate(
# TODO: see if we can add this to get_schema
schema = add_types_to_props(schema)
if schema is None:
raise ValueError(f"Could not get schema for type = {schema_type}")
message = f"Could not get schema for type = {schema_type}"
logger.error(message)
raise ValueError(message)

try:
jsonschema.validate(instance=converted_dictionary, schema=schema)
Expand Down
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ pytest = "^7.0.0"
pytest-cov = "*"
requests-mock = "*"

[tool.poetry.scripts]
heal = "heal.cli.heal_cli:main"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
Expand Down
1 change: 1 addition & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json

import pytest

from heal.vlmd.config import (
Expand Down
Loading

0 comments on commit 5a8ce80

Please sign in to comment.