diff --git a/src/ingest_validation_tests/ome_tiff_field_validator.py b/src/ingest_validation_tests/ome_tiff_field_validator.py index 0f37bfe..21ad1f6 100644 --- a/src/ingest_validation_tests/ome_tiff_field_validator.py +++ b/src/ingest_validation_tests/ome_tiff_field_validator.py @@ -1,8 +1,7 @@ import json import re - -# from functools import partial -# from multiprocessing import Pool +from functools import partial +from multiprocessing import Pool from os import cpu_count from pathlib import Path from typing import List, Optional @@ -17,57 +16,58 @@ def _log(message: str): print(message) -def expand_terms(dct: dict, prefix: str = "") -> dict: +def expand_terms(dct: dict, prefix: str = "") -> list: """ Convert a dict of of XML info as provided by xmlschema to the form used in the dictionary of expected fields """ - rslt = {} + rslt = [] expanded_prefix = prefix + "_" if prefix else "" for key, val in dct.items(): if key.startswith("@"): # terminal element - rslt[expanded_prefix + key[1:]] = val + rslt.append((expanded_prefix + key[1:], val)) elif key == "$" and isinstance(val, str): # special case? - rslt[expanded_prefix + key] = val + rslt.append((expanded_prefix + key, val)) else: - child_dct = {} + child_list_list = [] if isinstance(val, list): - assert len(val) == 1, "Expected only one element in list of dicts" - child_dct.update(expand_terms(val[0], expanded_prefix + key)) + for elt in val: + child_list_list.append(expand_terms(elt, expanded_prefix + key)) elif isinstance(val, dict): - child_dct.update(expand_terms(val, expanded_prefix + key)) + child_list_list.append(expand_terms(val, expanded_prefix + key)) elif val is None: - child_dct[expanded_prefix + key] = None + child_list_list.append([(expanded_prefix + key, None)]) else: raise ValueError(f"list or dict expected; got {type(val)} {val}") - for key, val in child_dct.items(): - rslt[key] = val + for child_list in child_list_list: + for key, val in child_list: + rslt.append((key, val)) return rslt -def check_one_prop(key: str, all_prop_dct: dict, this_test: dict) -> None: +def check_one_prop(key: str, all_prop_list: list, this_test: dict) -> None: + all_prop_keys = set(key for key, val in all_prop_list) test_type = this_test["dtype"] if test_type == "trap": # This test is useful when you want to scan lots of ome-tiff files for an # example of a new field type - if key in all_prop_dct: - raise RuntimeError(f"TRAP: {key} -> {all_prop_dct[key]} vs {this_test}") + if key in all_prop_keys: + raise RuntimeError(f"TRAP: {key} in {all_prop_keys} vs {this_test}") else: pass elif test_type == "categorical": allowed_vals = this_test["allowed_values"] - assert key in all_prop_dct, f"{key} is required but missing" - assert all_prop_dct[key] in allowed_vals, ( - f"{key} = {all_prop_dct[key]}" f" not one of {allowed_vals}" - ) + assert key in all_prop_keys, f"{key} is required but missing" + for val in [thisval for thiskey, thisval in all_prop_list if thiskey == key]: + assert val in allowed_vals, f"{key} == {val} is not one of {allowed_vals}" elif test_type == "integer": - assert key in all_prop_dct, f"{key} is required but missing" - assert isinstance(all_prop_dct[key], int), f"{key} = {all_prop_dct[key]}" f" is not an int" + assert key in all_prop_keys, f"{key} is required but missing" + for val in [thisval for thiskey, thisval in all_prop_list if thiskey == key]: + assert isinstance(val, int), f"{key} = {val} is not an int" elif test_type == "float": - assert key in all_prop_dct, f"{key} is required but missing" - assert isinstance(all_prop_dct[key], float), ( - f"{key} = {all_prop_dct[key]}" f" is not a float" - ) + assert key in all_prop_keys, f"{key} is required but missing" + for val in [thisval for thiskey, thisval in all_prop_list if thiskey == key]: + assert isinstance(val, float), f"{key} = {val} is not a float" else: raise NotImplementedError(f"Unimplemented dtype {test_type} for ome-tiff field") @@ -76,10 +76,14 @@ def _check_ome_tiff_file(file: str, /, tests: dict) -> Optional[str]: try: with tifffile.TiffFile(file) as tf: xml_document = xmlschema.XmlDocument(tf.ome_metadata) + except Exception: + return f"{file} is not a valid OME.TIFF file: Failed to read OME XML" + + try: image_props = xmlschema.to_dict(xml_document)["Image"] - expanded_props = {} + expanded_props = [] for term_dct in image_props: - expanded_props.update(expand_terms(term_dct)) + expanded_props.extend(expand_terms(term_dct)) error_l = [] for key in tests: try: @@ -113,8 +117,11 @@ def collect_errors(self, **kwargs) -> List[Optional[str]]: if re.fullmatch(test_set["re"], self.assay_type): all_tests.update(test_set["fields"]) + if not all_tests: + return [] # nothing to test for this assay + threads = kwargs.get("coreuse", None) or cpu_count() // 4 or 1 - # pool = Pool(threads) + pool = Pool(threads) _log(f"Threading at OmeTiffFieldValidator with {threads}") filenames_to_test = [] for glob_expr in [ @@ -127,23 +134,16 @@ def collect_errors(self, **kwargs) -> List[Optional[str]]: for file in path.glob(glob_expr): filenames_to_test.append(file) - # TODO: turn back on when issues with XML parsing are resolved - # still collecting files so we know if this plugin *should* have run - - # rslt_list: List[Optional[str]] = list( - # rslt - # for rslt in pool.imap_unordered( - # partial(_check_ome_tiff_file, tests=all_tests), filenames_to_test - # ) - # if rslt is not None - # ) - # if rslt_list: - # return rslt_list - # elif filenames_to_test: - if filenames_to_test: - _log( - f"Found files to test but skipping ome-tiff field validation. Files: {filenames_to_test}" + rslt_list: List[Optional[str]] = list( + rslt + for rslt in pool.imap_unordered( + partial(_check_ome_tiff_file, tests=all_tests), filenames_to_test ) + if rslt is not None + ) + if rslt_list: + return rslt_list + elif filenames_to_test: return [None] else: return [] diff --git a/src/ingest_validation_tests/ome_tiff_fields.json b/src/ingest_validation_tests/ome_tiff_fields.json index 0462d77..2c97318 100644 --- a/src/ingest_validation_tests/ome_tiff_fields.json +++ b/src/ingest_validation_tests/ome_tiff_fields.json @@ -1,6 +1,6 @@ [ { - "re": "C.*EX", + "re": ".*", "fields": { "Pixels_DimensionOrder": { "dtype": "categorical", diff --git a/src/ingest_validation_tests/ome_tiff_fields_schema.json b/src/ingest_validation_tests/ome_tiff_fields_schema.json index e88cefe..3837787 100644 --- a/src/ingest_validation_tests/ome_tiff_fields_schema.json +++ b/src/ingest_validation_tests/ome_tiff_fields_schema.json @@ -1,5 +1,5 @@ { - "$schema": "http://json-schema.org/schema#", + "$schema": "https://json-schema.org/draft/2020-12/schema", "$id": "http://schemata.hubmapconsortium.org/ome_tiff_fields_schema_schema.json", "title": "ome-tiff fields schema", "description": "schema for the definitions file of required ome-tiff fields", diff --git a/test_data/complex_small_ome_tiff.zip b/test_data/complex_small_ome_tiff.zip new file mode 100644 index 0000000..9df75ea Binary files /dev/null and b/test_data/complex_small_ome_tiff.zip differ diff --git a/tests/test_ome_tiff_field_validator.py b/tests/test_ome_tiff_field_validator.py index 7453d42..fc33f96 100644 --- a/tests/test_ome_tiff_field_validator.py +++ b/tests/test_ome_tiff_field_validator.py @@ -8,17 +8,27 @@ @pytest.mark.parametrize( ("test_data_fname", "msg_re_list", "assay_type"), ( - # ( - # "test_data/codex_tree_ometiff_bad.zip", - # [ - # ".*tubhiswt_C0_bad.ome.tif is not a valid OME.TIFF file.*", - # ".*sample1.ome.tif is not a valid OME.TIFF file.*", - # ".*sample2.ome.tif is not a valid OME.TIFF file.*", - # ], - # "CODEX", - # ), + ( + "test_data/codex_tree_ometiff_bad.zip", + [ + ".*tubhiswt_C0_bad.ome.tif is not a valid OME.TIFF file.*", + ".*sample1.ome.tif is not a valid OME.TIFF file.*", + ".*sample2.ome.tif is not a valid OME.TIFF file.*", + ], + "CODEX", + ), ("test_data/codex_tree_ometiff_good.zip", [], "CODEX"), ("test_data/fake_snrnaseq_tree_good.zip", [], "snRNAseq"), + ( + "test_data/complex_small_ome_tiff.zip", + [ + ".*complex_small_ome_tiff/917_cropped_0_Z0_C3_T0.ome.tiff is not" + " a valid OME.TIFF file: Pixels_PhysicalSizeX is required but missing;" + " Pixels_PhysicalSizeY is required but missing;" + " Pixels_PhysicalSizeZ is required but missing" + ], + "PAS", + ), ), ) def test_ome_tiff_field_validator(test_data_fname, msg_re_list, assay_type, tmp_path):