Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Welling/fix ome tiff field validator #69

Merged
merged 13 commits into from
Dec 12, 2024
92 changes: 46 additions & 46 deletions src/ingest_validation_tests/ome_tiff_field_validator.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
import json
import re

# from functools import partial
# from multiprocessing import Pool
from functools import partial
from multiprocessing import Pool
from os import cpu_count
from pathlib import Path
from typing import List, Optional
Expand All @@ -17,57 +16,58 @@ def _log(message: str):
print(message)


def expand_terms(dct: dict, prefix: str = "") -> dict:
def expand_terms(dct: dict, prefix: str = "") -> list:
"""
Convert a dict of of XML info as provided by xmlschema to the
form used in the dictionary of expected fields
"""
rslt = {}
rslt = []
expanded_prefix = prefix + "_" if prefix else ""
for key, val in dct.items():
if key.startswith("@"): # terminal element
rslt[expanded_prefix + key[1:]] = val
rslt.append((expanded_prefix + key[1:], val))
elif key == "$" and isinstance(val, str): # special case?
rslt[expanded_prefix + key] = val
rslt.append((expanded_prefix + key, val))
else:
child_dct = {}
child_list_list = []
if isinstance(val, list):
assert len(val) == 1, "Expected only one element in list of dicts"
child_dct.update(expand_terms(val[0], expanded_prefix + key))
for elt in val:
child_list_list.append(expand_terms(elt, expanded_prefix + key))
elif isinstance(val, dict):
child_dct.update(expand_terms(val, expanded_prefix + key))
child_list_list.append(expand_terms(val, expanded_prefix + key))
elif val is None:
child_dct[expanded_prefix + key] = None
child_list_list.append([(expanded_prefix + key, None)])
else:
raise ValueError(f"list or dict expected; got {type(val)} {val}")
for key, val in child_dct.items():
rslt[key] = val
for child_list in child_list_list:
for key, val in child_list:
rslt.append((key, val))
return rslt


def check_one_prop(key: str, all_prop_dct: dict, this_test: dict) -> None:
def check_one_prop(key: str, all_prop_list: list, this_test: dict) -> None:
all_prop_keys = set(key for key, val in all_prop_list)
test_type = this_test["dtype"]
if test_type == "trap":
# This test is useful when you want to scan lots of ome-tiff files for an
# example of a new field type
if key in all_prop_dct:
raise RuntimeError(f"TRAP: {key} -> {all_prop_dct[key]} vs {this_test}")
if key in all_prop_keys:
raise RuntimeError(f"TRAP: {key} in {all_prop_keys} vs {this_test}")
else:
pass
elif test_type == "categorical":
allowed_vals = this_test["allowed_values"]
assert key in all_prop_dct, f"{key} is required but missing"
assert all_prop_dct[key] in allowed_vals, (
f"{key} = {all_prop_dct[key]}" f" not one of {allowed_vals}"
)
assert key in all_prop_keys, f"{key} is required but missing"
for val in [thisval for thiskey, thisval in all_prop_list if thiskey == key]:
assert val in allowed_vals, f"{key} == {val} is not one of {allowed_vals}"
elif test_type == "integer":
assert key in all_prop_dct, f"{key} is required but missing"
assert isinstance(all_prop_dct[key], int), f"{key} = {all_prop_dct[key]}" f" is not an int"
assert key in all_prop_keys, f"{key} is required but missing"
for val in [thisval for thiskey, thisval in all_prop_list if thiskey == key]:
assert isinstance(val, int), f"{key} = {val} is not an int"
elif test_type == "float":
assert key in all_prop_dct, f"{key} is required but missing"
assert isinstance(all_prop_dct[key], float), (
f"{key} = {all_prop_dct[key]}" f" is not a float"
)
assert key in all_prop_keys, f"{key} is required but missing"
for val in [thisval for thiskey, thisval in all_prop_list if thiskey == key]:
assert isinstance(val, float), f"{key} = {val} is not a float"
else:
raise NotImplementedError(f"Unimplemented dtype {test_type} for ome-tiff field")

Expand All @@ -76,10 +76,14 @@ def _check_ome_tiff_file(file: str, /, tests: dict) -> Optional[str]:
try:
with tifffile.TiffFile(file) as tf:
xml_document = xmlschema.XmlDocument(tf.ome_metadata)
except Exception:
return f"{file} is not a valid OME.TIFF file: Failed to read OME XML"

try:
image_props = xmlschema.to_dict(xml_document)["Image"]
expanded_props = {}
expanded_props = []
for term_dct in image_props:
expanded_props.update(expand_terms(term_dct))
expanded_props.extend(expand_terms(term_dct))
error_l = []
for key in tests:
try:
Expand Down Expand Up @@ -113,8 +117,11 @@ def collect_errors(self, **kwargs) -> List[Optional[str]]:
if re.fullmatch(test_set["re"], self.assay_type):
all_tests.update(test_set["fields"])

if not all_tests:
return [] # nothing to test for this assay

threads = kwargs.get("coreuse", None) or cpu_count() // 4 or 1
# pool = Pool(threads)
pool = Pool(threads)
_log(f"Threading at OmeTiffFieldValidator with {threads}")
filenames_to_test = []
for glob_expr in [
Expand All @@ -127,23 +134,16 @@ def collect_errors(self, **kwargs) -> List[Optional[str]]:
for file in path.glob(glob_expr):
filenames_to_test.append(file)

# TODO: turn back on when issues with XML parsing are resolved
# still collecting files so we know if this plugin *should* have run

# rslt_list: List[Optional[str]] = list(
# rslt
# for rslt in pool.imap_unordered(
# partial(_check_ome_tiff_file, tests=all_tests), filenames_to_test
# )
# if rslt is not None
# )
# if rslt_list:
# return rslt_list
# elif filenames_to_test:
if filenames_to_test:
_log(
f"Found files to test but skipping ome-tiff field validation. Files: {filenames_to_test}"
rslt_list: List[Optional[str]] = list(
rslt
for rslt in pool.imap_unordered(
partial(_check_ome_tiff_file, tests=all_tests), filenames_to_test
)
if rslt is not None
)
if rslt_list:
return rslt_list
elif filenames_to_test:
return [None]
else:
return []
2 changes: 1 addition & 1 deletion src/ingest_validation_tests/ome_tiff_fields.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[
{
"re": "C.*EX",
"re": ".*",
"fields": {
"Pixels_DimensionOrder": {
"dtype": "categorical",
Expand Down
2 changes: 1 addition & 1 deletion src/ingest_validation_tests/ome_tiff_fields_schema.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"$schema": "http://json-schema.org/schema#",
"$schema": "https://json-schema.org/draft/2020-12/schema",
"$id": "http://schemata.hubmapconsortium.org/ome_tiff_fields_schema_schema.json",
"title": "ome-tiff fields schema",
"description": "schema for the definitions file of required ome-tiff fields",
Expand Down
Binary file added test_data/complex_small_ome_tiff.zip
Binary file not shown.
28 changes: 19 additions & 9 deletions tests/test_ome_tiff_field_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,27 @@
@pytest.mark.parametrize(
("test_data_fname", "msg_re_list", "assay_type"),
(
# (
# "test_data/codex_tree_ometiff_bad.zip",
# [
# ".*tubhiswt_C0_bad.ome.tif is not a valid OME.TIFF file.*",
# ".*sample1.ome.tif is not a valid OME.TIFF file.*",
# ".*sample2.ome.tif is not a valid OME.TIFF file.*",
# ],
# "CODEX",
# ),
(
"test_data/codex_tree_ometiff_bad.zip",
[
".*tubhiswt_C0_bad.ome.tif is not a valid OME.TIFF file.*",
".*sample1.ome.tif is not a valid OME.TIFF file.*",
".*sample2.ome.tif is not a valid OME.TIFF file.*",
],
"CODEX",
),
("test_data/codex_tree_ometiff_good.zip", [], "CODEX"),
("test_data/fake_snrnaseq_tree_good.zip", [], "snRNAseq"),
(
"test_data/complex_small_ome_tiff.zip",
[
".*complex_small_ome_tiff/917_cropped_0_Z0_C3_T0.ome.tiff is not"
" a valid OME.TIFF file: Pixels_PhysicalSizeX is required but missing;"
" Pixels_PhysicalSizeY is required but missing;"
" Pixels_PhysicalSizeZ is required but missing"
],
"PAS",
),
),
)
def test_ome_tiff_field_validator(test_data_fname, msg_re_list, assay_type, tmp_path):
Expand Down