From 89e3ee997bb4289b033e421d327252d81cbd4607 Mon Sep 17 00:00:00 2001 From: Minh Nguyen Date: Tue, 19 Nov 2024 20:37:17 +0100 Subject: [PATCH] add model parsing and tests for distributions - support secorolab/metamodels#13 - add DistributionModel for parsing univariate and multivariate versions of the uniform and normal distributions, as well as uniform rotation (3D) distribution - add method to sample using info from DistributionModel - add SampledQuantityModel that caches the sample when requested - add unit test on a valid model that checks sampling of all supported distributions - minor: use check_shacl_constraints func in test_python_model - minor: remove black precommit hook since it conflicts with ruff --- .pre-commit-config.yaml | 4 - src/rdf_utils/constraints.py | 2 +- src/rdf_utils/models/common.py | 12 +- src/rdf_utils/models/distribution.py | 244 +++++++++++++++++++++++++++ src/rdf_utils/namespace.py | 3 +- src/rdf_utils/uri.py | 4 + tests/test_distribution.py | 126 ++++++++++++++ tests/test_python_model.py | 13 +- 8 files changed, 389 insertions(+), 19 deletions(-) create mode 100644 src/rdf_utils/models/distribution.py create mode 100644 tests/test_distribution.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2ffb058..bf0c129 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,8 +1,4 @@ repos: -- repo: https://github.com/psf/black - rev: 24.8.0 - hooks: - - id: black - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.5.6 hooks: diff --git a/src/rdf_utils/constraints.py b/src/rdf_utils/constraints.py index 76c4c2f..a90e67f 100644 --- a/src/rdf_utils/constraints.py +++ b/src/rdf_utils/constraints.py @@ -16,7 +16,7 @@ def __init__(self, violation_str: str): def check_shacl_constraints(graph: Graph, shacl_dict: Dict[str, str], quiet=False) -> bool: """ - :param graph: rdfl.Graph to be checked + :param graph: rdflib.Graph to be checked :param shacl_dict: mapping from SHACL path to graph format, e.g. 
URL -> "turtle" :param quiet: if true will not throw an exception """ diff --git a/src/rdf_utils/models/common.py b/src/rdf_utils/models/common.py index d1cc5cd..e437bb1 100644 --- a/src/rdf_utils/models/common.py +++ b/src/rdf_utils/models/common.py @@ -25,15 +25,21 @@ class ModelBase(object): types: set[URIRef] _attributes: Dict[URIRef, Any] - def __init__(self, node_id: URIRef, graph: Optional[Graph] = None, types: Optional[set[URIRef]] = None) -> None: + def __init__( + self, node_id: URIRef, graph: Optional[Graph] = None, types: Optional[set[URIRef]] = None + ) -> None: self.id = node_id if graph is not None: self.types = get_node_types(graph=graph, node_id=node_id) - assert types is None, f"ModelBase.__init__: node '{node_id}': both 'graph' and 'types' args are not None" + assert ( + types is None + ), f"ModelBase.__init__: node '{node_id}': both 'graph' and 'types' args are not None" elif types is not None: self.types = types else: - raise RuntimeError(f"ModelBase.__init__: node '{node_id}': neither 'graph' or 'types' specified") + raise RuntimeError( + f"ModelBase.__init__: node '{node_id}': neither 'graph' or 'types' specified" + ) assert len(self.types) > 0, f"node '{self.id}' has no type" self._attributes = {} diff --git a/src/rdf_utils/models/distribution.py b/src/rdf_utils/models/distribution.py new file mode 100644 index 0000000..5ea5e0d --- /dev/null +++ b/src/rdf_utils/models/distribution.py @@ -0,0 +1,244 @@ +# SPDX-License-Identifier: MPL-2.0 +from typing import Any, Optional +import numpy as np +from rdflib import BNode, Literal, URIRef, Graph +from rdf_utils.collection import load_list_re +from rdf_utils.models.common import ModelBase +from rdf_utils.namespace import NS_MM_DISTRIB + + +URI_DISTRIB_TYPE_DISTRIB = NS_MM_DISTRIB["Distribution"] +URI_DISTRIB_PRED_DIM = NS_MM_DISTRIB["dimension"] + +URI_DISTRIB_TYPE_CONT = NS_MM_DISTRIB["Continuous"] +URI_DISTRIB_TYPE_DISCR = NS_MM_DISTRIB["Discrete"] + +URI_DISTRIB_TYPE_UNIFORM = 
NS_MM_DISTRIB["Uniform"] +URI_DISTRIB_PRED_UPPER = NS_MM_DISTRIB["upper-bound"] +URI_DISTRIB_PRED_LOWER = NS_MM_DISTRIB["lower-bound"] + +URI_DISTRIB_TYPE_NORMAL = NS_MM_DISTRIB["Normal"] +URI_DISTRIB_PRED_MEAN = NS_MM_DISTRIB["mean"] +URI_DISTRIB_PRED_STD = NS_MM_DISTRIB["standard-deviation"] +URI_DISTRIB_PRED_COV = NS_MM_DISTRIB["covariance"] + +URI_DISTRIB_TYPE_UNIFORM_ROT = NS_MM_DISTRIB["UniformRotation"] + +URI_DISTRIB_TYPE_SAMPLED_QUANTITY = NS_MM_DISTRIB["SampledQuantity"] +URI_DISTRIB_PRED_FROM_DISTRIB = NS_MM_DISTRIB["from-distribution"] + + +def _get_float_from_literal(literal: Literal) -> float: + try: + lit_val = literal.toPython() + return float(lit_val) + except ValueError as e: + raise ValueError(f"can't convert literal '{literal}' as float: {e}") + + +class DistributionModel(ModelBase): + distrib_type: URIRef + + def __init__(self, distrib_id: URIRef, graph: Graph) -> None: + super().__init__(node_id=distrib_id, graph=graph) + + if URI_DISTRIB_TYPE_UNIFORM_ROT in self.types: + self.distrib_type = URI_DISTRIB_TYPE_UNIFORM_ROT + elif URI_DISTRIB_TYPE_UNIFORM in self.types: + self.distrib_type = URI_DISTRIB_TYPE_UNIFORM + self._load_uniform_distrib_attrs(graph=graph) + elif URI_DISTRIB_TYPE_NORMAL in self.types: + self.distrib_type = URI_DISTRIB_TYPE_NORMAL + self._load_normal_distrib_attrs(graph=graph) + else: + raise RuntimeError(f"Distrib '{self.id}' has unhandled types: {self.types}") + + def _load_uniform_distrib_attrs(self, graph: Graph) -> None: + # dimension + dim_node = graph.value(subject=self.id, predicate=URI_DISTRIB_PRED_DIM) + assert isinstance( + dim_node, Literal + ), f"Uniform distrib '{self.id}' does not have a Literal 'dimension': {dim_node}" + dim = dim_node.toPython() + assert ( + isinstance(dim, int) and dim > 0 + ), f"Uniform distrib '{self.id}' does not have a positive integer 'dimension': {dim}" + + upper_bounds = None + lower_bounds = None + + # upper bound(s) + upper_node = graph.value(subject=self.id, 
predicate=URI_DISTRIB_PRED_UPPER) + if isinstance(upper_node, Literal): + upper_val = _get_float_from_literal(upper_node) + upper_bounds = [upper_val] + elif isinstance(upper_node, BNode): + upper_bounds = load_list_re( + graph=graph, first_node=upper_node, parse_uri=False, quiet=False + ) + else: + raise RuntimeError( + f"Uniform distrib '{self.id}' has invalid type for :upper-bound: {type(upper_node)}" + ) + + # lower bound(s) + lower_node = graph.value(subject=self.id, predicate=URI_DISTRIB_PRED_LOWER) + if isinstance(lower_node, Literal): + lower_val = _get_float_from_literal(lower_node) + lower_bounds = [lower_val] + elif isinstance(lower_node, BNode): + lower_bounds = load_list_re( + graph=graph, first_node=lower_node, parse_uri=False, quiet=False + ) + else: + raise RuntimeError( + f"Uniform distrib '{self.id}' has invalid type for lower-bound: {type(lower_node)}" + ) + + # check property dimensions + assert ( + dim == len(lower_bounds) and dim == len(upper_bounds) + ), f"Uniform distrib '{self.id}' has mismatching property dimensions: dim={dim}, upper bounds num={len(upper_bounds)}, lower bounds num={len(lower_bounds)}" + + # check lower bounds less than higher bounds + less_than = np.less(lower_bounds, upper_bounds) + assert np.all( + less_than + ), f"Uniform distrib '{self.id}': not all lower bounds less than upper bounds: lower={lower_bounds}, upper={upper_bounds}" + + # set attributes + self.set_attr(key=URI_DISTRIB_PRED_DIM, val=dim) + self.set_attr(key=URI_DISTRIB_PRED_UPPER, val=upper_bounds) + self.set_attr(key=URI_DISTRIB_PRED_LOWER, val=lower_bounds) + + def _load_normal_distrib_attrs(self, graph: Graph) -> None: + # dimension + dim_node = graph.value(subject=self.id, predicate=URI_DISTRIB_PRED_DIM) + assert isinstance( + dim_node, Literal + ), f"Normal distrib '{self.id}' does not have a Literal 'dimension': {dim_node}" + dim = dim_node.toPython() + assert ( + isinstance(dim, int) and dim > 0 + ), f"Normal distrib '{self.id}' does not have a 
positive integer 'dimension': {dim}" + self.set_attr(key=URI_DISTRIB_PRED_DIM, val=dim) + + # get mean + mean_node = graph.value(subject=self.id, predicate=URI_DISTRIB_PRED_MEAN) + if isinstance(mean_node, Literal): + assert ( + dim == 1 + ), f"Normal distrib '{self.id}' has single mean '{mean_node}' but dimension '{dim}'" + mean_val = _get_float_from_literal(mean_node) + self.set_attr(key=URI_DISTRIB_PRED_MEAN, val=[mean_val]) + elif isinstance(mean_node, BNode): + mean_vals = load_list_re( + graph=graph, first_node=mean_node, parse_uri=False, quiet=False + ) + assert ( + len(mean_vals) == dim + ), f"Normal distrib '{self.id}': number of mean values ({len(mean_vals)}) does not match dimension ({dim})" + self.set_attr(key=URI_DISTRIB_PRED_MEAN, val=mean_vals) + else: + raise RuntimeError( + f"Normal distrib '{self.id}' has invalid type for 'mean': {type(mean_node)}" + ) + + # get standard deviation or covariance based on dimension + if dim == 1: + std_node = graph.value(subject=self.id, predicate=URI_DISTRIB_PRED_STD) + assert isinstance( + std_node, Literal + ), f"Normal distrib '{self.id}' does not have a Literal 'standard-deviation': {std_node}" + std = _get_float_from_literal(std_node) + self.set_attr(key=URI_DISTRIB_PRED_STD, val=std) + else: + cov_node = graph.value(subject=self.id, predicate=URI_DISTRIB_PRED_COV) + assert isinstance( + cov_node, BNode + ), f"Normal distrib '{self.id}': 'covariance' property not a container, type={type(cov_node)}" + cov_vals = load_list_re(graph=graph, first_node=cov_node, parse_uri=False, quiet=False) + try: + cov_mat = np.array(cov_vals, dtype=float) + except ValueError as e: + raise ValueError( + f"Normal distrib '{self.id}', can't convert covariance to float numpy array: {e}\n{cov_vals}" + ) + assert ( + cov_mat.shape + == ( + dim, + dim, + ) + ), f"Normal distrib '{self.id}': dimension='{dim}' doesn't match 'covariance' shape'{cov_mat.shape}'" + self.set_attr(key=URI_DISTRIB_PRED_COV, val=cov_mat) + + +def 
sample_from_distrib( + distrib: DistributionModel, size: Optional[int | tuple[int, ...]] = None +) -> Any: + if URI_DISTRIB_TYPE_UNIFORM_ROT in distrib.types: + try: + from scipy.spatial.transform import Rotation + except ImportError: + raise RuntimeError("to sample random rotations, 'scipy' must be installed") + + return Rotation.random() + + if URI_DISTRIB_TYPE_UNIFORM in distrib.types: + lower_bounds = distrib.get_attr(key=URI_DISTRIB_PRED_LOWER) + upper_bounds = distrib.get_attr(key=URI_DISTRIB_PRED_UPPER) + assert isinstance(lower_bounds, list) and isinstance( + upper_bounds, list + ), f"Uniform distrib '{distrib.id}' does not have valid lower & upper bounds" + return np.random.uniform(lower_bounds, upper_bounds, size=size) + + if URI_DISTRIB_TYPE_NORMAL in distrib.types: + dim = distrib.get_attr(key=URI_DISTRIB_PRED_DIM) + assert ( + isinstance(dim, int) and dim > 0 + ), f"Normal distrib '{distrib.id}' does not have valid dimension: {dim}" + + mean = distrib.get_attr(key=URI_DISTRIB_PRED_MEAN) + assert ( + isinstance(mean, list) and len(mean) == dim + ), f"Normal distrib '{distrib.id}' does not have valid mean: {mean}" + + if dim == 1: + std = distrib.get_attr(key=URI_DISTRIB_PRED_STD) + assert isinstance( + std, float + ), f"Normal distrib '{distrib.id}' does not have valid standard deviation: {std}" + return np.random.normal(loc=mean[0], scale=std, size=size) + + # multivariate normal + cov = distrib.get_attr(key=URI_DISTRIB_PRED_COV) + assert isinstance( + cov, np.ndarray + ), f"Normal distrib '{distrib.id}' does not have valid covariance: {cov}" + return np.random.multivariate_normal(mean=mean, cov=cov, size=size) + + raise RuntimeError(f"Distrib '{distrib.id}' has unhandled types: {distrib.types}") + + +class SampledQuantityModel(ModelBase): + distribution: DistributionModel + _sampled_value: Optional[Any] + + def __init__(self, quantity_id: URIRef, graph: Graph) -> None: + super().__init__(node_id=quantity_id, graph=graph) + + distrib_id = 
graph.value(subject=self.id, predicate=URI_DISTRIB_PRED_FROM_DISTRIB) + assert isinstance( + distrib_id, URIRef + ), f"SampledQuantity '{self.id}' does not link to a distribution node: {distrib_id}" + self.distribution = DistributionModel(distrib_id=distrib_id, graph=graph) + + self._sampled_value = None + + def sample(self, resample: bool = True) -> Any: + if not resample and self._sampled_value is not None: + return self._sampled_value + + self._sampled_value = sample_from_distrib(distrib=self.distribution) + return self._sampled_value diff --git a/src/rdf_utils/namespace.py b/src/rdf_utils/namespace.py index e564711..250ecee 100644 --- a/src/rdf_utils/namespace.py +++ b/src/rdf_utils/namespace.py @@ -2,6 +2,7 @@ from rdflib import Namespace from rdf_utils.uri import ( URI_MM_AGN, + URI_MM_DISTRIB, URI_MM_GEOM, URI_MM_GEOM_REL, URI_MM_GEOM_COORD, @@ -17,8 +18,8 @@ NS_MM_GEOM_COORD = Namespace(URI_MM_GEOM_COORD) NS_MM_PYTHON = Namespace(URI_MM_PYTHON) - NS_MM_ENV = Namespace(URI_MM_ENV) NS_MM_AGN = Namespace(URI_MM_AGN) NS_MM_TIME = Namespace(URI_MM_TIME) NS_MM_EL = Namespace(URI_MM_EL) +NS_MM_DISTRIB = Namespace(URI_MM_DISTRIB) diff --git a/src/rdf_utils/uri.py b/src/rdf_utils/uri.py index 250fad6..b35c6ec 100644 --- a/src/rdf_utils/uri.py +++ b/src/rdf_utils/uri.py @@ -25,6 +25,10 @@ URL_MM_EL_JSON = f"{URL_SECORO_MM}/behaviour/event_loop.json" URL_MM_EL_SHACL = f"{URL_SECORO_MM}/behaviour/event_loop.shacl.ttl" +URI_MM_DISTRIB = f"{URL_SECORO_MM}/probability/distribution#" +URL_MM_DISTRIB_JSON = f"{URL_SECORO_MM}/probability/distribution.json" +URL_MM_DISTRIB_SHACL = f"{URL_SECORO_MM}/probability/distribution.shacl.ttl" + def try_expand_curie( ns_manager: NamespaceManager, curie_str: str, quiet: bool = False diff --git a/tests/test_distribution.py b/tests/test_distribution.py new file mode 100644 index 0000000..ad70a0a --- /dev/null +++ b/tests/test_distribution.py @@ -0,0 +1,126 @@ +# SPDX-License-Identifier: MPL-2.0 +import unittest +import numpy as np +from 
rdflib import Graph, URIRef +from rdf_utils.constraints import check_shacl_constraints +from rdf_utils.models.distribution import ( + DistributionModel, + SampledQuantityModel, + sample_from_distrib, +) +from rdf_utils.resolver import install_resolver +from rdf_utils.uri import URL_MM_DISTRIB_JSON, URL_MM_DISTRIB_SHACL, URL_SECORO_M + +# random distribution params +NUM_SAMPLE = 20 +DIM = 7 +RAND_NUMS = np.round(np.random.rand(DIM) * 100 - 50, decimals=3) # numbers in range [-50, 50) +RAND_RANGE = np.random.randint(10, 20, size=DIM) +RAND_SQUARE_MAT = np.random.uniform(-50, 50, size=(DIM, DIM)) +RAND_COV = np.round( + np.dot(np.transpose(RAND_SQUARE_MAT), RAND_SQUARE_MAT), decimals=3 +) # covariance of shape (DIM, DIM) + + +def get_matrix_string(matrix): + row_strs = [] + for row in matrix: + row_strs.append(f"[{", ".join(map(str, row))}]") + return f"[{", ".join(row_strs)}]" + + +# JSON-LD model +URI_TEST = f"{URL_SECORO_M}/tests/collection" +URI_TEST_UNI_ROT = f"{URI_TEST}/uniform-rotation" +URI_TEST_SAMPLED_ROT = f"{URI_TEST}/sampled-rotation" +URI_TEST_UNIFORM_UNI = f"{URI_TEST}/uniform-univariate" +URI_TEST_UNIFORM_MULTI = f"{URI_TEST}/uniform-multivariate" +URI_TEST_NORMAL_UNI = f"{URI_TEST}/normal-univariate" +URI_TEST_NORMAL_MULTI = f"{URI_TEST}/normal-multivariate" +VALID_DISTRIB_MODEL = f""" +{{ + "@context": [ + "{URL_MM_DISTRIB_JSON}" + ], + "@graph": [ + {{ "@id": "{URI_TEST_UNI_ROT}", "@type": [ "Distribution", "UniformRotation" ] }}, + {{ + "@id": "{URI_TEST_SAMPLED_ROT}", "@type": [ "SampledQuantity" ], + "from-distribution": "{URI_TEST_UNI_ROT}" + }}, + {{ + "@id": "{URI_TEST_UNIFORM_UNI}", "@type": [ "Distribution", "Uniform" ], + "dimension": 1, "lower-bound": {RAND_NUMS[0]}, "upper-bound": {RAND_NUMS[0] + RAND_RANGE[0]} + }}, + {{ + "@id": "{URI_TEST_UNIFORM_MULTI}", "@type": [ "Distribution", "Uniform" ], "dimension": {DIM}, + "lower-bound": [ {", ".join(map(str, RAND_NUMS))} ], + "upper-bound": [ {", ".join(map(str, RAND_NUMS + RAND_RANGE))} ] 
+ }}, + {{ + "@id": "{URI_TEST_NORMAL_UNI}", "@type": [ "Distribution", "Normal" ], + "dimension": 1, "mean": {RAND_NUMS[0]}, "std-dev": {float(RAND_RANGE[0])} + }}, + {{ + "@id": "{URI_TEST_NORMAL_MULTI}", "@type": [ "Distribution", "Normal" ], "dimension": {DIM}, + "mean": [ {", ".join(map(str, RAND_NUMS))} ], + "covariance": {get_matrix_string(RAND_COV)} + }} + ] +}} +""" + + +class DistributionTest(unittest.TestCase): + def setUp(self): + install_resolver() + + def test_correct_distrib_models(self): + correct_g = Graph() + correct_g.parse(data=VALID_DISTRIB_MODEL, format="json-ld") + + check_shacl_constraints( + graph=correct_g, shacl_dict={URL_MM_DISTRIB_SHACL: "turtle"}, quiet=False + ) + + # uniform rotation with resampling + uni_rot = SampledQuantityModel(quantity_id=URIRef(URI_TEST_SAMPLED_ROT), graph=correct_g) + rot1 = uni_rot.sample() + rot2 = uni_rot.sample(resample=False) + rot3 = uni_rot.sample(resample=True) + self.assertIs( + rot1, rot2, "SampledQuantityModel.sample does not cache when 'resample' is False" + ) + self.assertIsNot( + rot1, rot3, "SampledQuantityModel.sample does not get new value 'resample' is True" + ) + rot1_quat = rot1.as_quat() + self.assertTrue(len(rot1_quat) == 4, "random rotation did not return a valid quaternion") + + # univariate uniform distribution + uniform_uni = DistributionModel(distrib_id=URIRef(URI_TEST_UNIFORM_UNI), graph=correct_g) + uniform_uni_samples = sample_from_distrib(distrib=uniform_uni, size=NUM_SAMPLE) + self.assertTrue(len(uniform_uni_samples) == NUM_SAMPLE) + + # multivariate uniform distribution + uniform_multi = DistributionModel( + distrib_id=URIRef(URI_TEST_UNIFORM_MULTI), graph=correct_g + ) + uniform_multi_samples = sample_from_distrib(distrib=uniform_multi, size=(NUM_SAMPLE, DIM)) + self.assertTrue( + uniform_multi_samples.shape == (NUM_SAMPLE, DIM), + f"sampling multivariate uniform distribution returns unexpected shape: {uniform_multi_samples.shape}", + ) + + # univariate normal distribution + 
normal_uni = DistributionModel(distrib_id=URIRef(URI_TEST_NORMAL_UNI), graph=correct_g) + normal_uni_samples = sample_from_distrib(distrib=normal_uni, size=NUM_SAMPLE) + self.assertTrue(len(normal_uni_samples) == NUM_SAMPLE) + + # multivariate normal distribution + normal_multi = DistributionModel(distrib_id=URIRef(URI_TEST_NORMAL_MULTI), graph=correct_g) + normal_multi_samples = sample_from_distrib(distrib=normal_multi, size=(NUM_SAMPLE, DIM)) + self.assertTrue( + normal_multi_samples.shape == (NUM_SAMPLE, DIM, DIM), + f"sampling multivariate normal distribution returns unexpected shape: {normal_multi_samples.shape}", + ) diff --git a/tests/test_python_model.py b/tests/test_python_model.py index f1ab2a4..e02d7e1 100644 --- a/tests/test_python_model.py +++ b/tests/test_python_model.py @@ -1,8 +1,8 @@ # SPDX-License-Identifier: MPL-2.0 import unittest from urllib.request import urlopen -import pyshacl from rdflib import Graph, URIRef +from rdf_utils.constraints import check_shacl_constraints from rdf_utils.models.common import ModelBase, ModelLoader from rdf_utils.uri import URL_MM_PYTHON_JSON, URL_MM_PYTHON_SHACL, URL_SECORO_M from rdf_utils.resolver import install_resolver @@ -43,16 +43,9 @@ def test_python_import(self): graph = Graph() graph.parse(data=PYTHON_MODEL, format="json-ld") - shacl_g = Graph() - shacl_g.parse(URL_MM_PYTHON_SHACL, format="turtle") - conforms, _, report_text = pyshacl.validate( - graph, - shacl_graph=shacl_g, - data_graph_format="json-ld", - shacl_graph_format="ttl", - inference="rdfs", + check_shacl_constraints( + graph=graph, shacl_dict={URL_MM_PYTHON_SHACL: "turtle"}, quiet=False ) - self.assertTrue(conforms, f"SHACL validation failed:\n{report_text}") os_path_exists = import_attr_from_node(graph, URI_OS_PATH_EXISTS) self.assertTrue(os_path_exists(self.mm_python_shacl_path))