diff --git a/poetry.lock b/poetry.lock index 438b83061..746ce0027 100644 --- a/poetry.lock +++ b/poetry.lock @@ -4,6 +4,7 @@ name = "alabaster" version = "0.7.13" description = "A configurable sidebar-enabled Sphinx theme" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -15,6 +16,7 @@ files = [ name = "altair" version = "4.2.0" description = "Altair: A declarative statistical visualization library for Python." +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -37,6 +39,7 @@ dev = ["black", "docutils", "flake8", "ipython", "m2r", "mistune (<2.0.0)", "pyt name = "anyio" version = "4.1.0" description = "High level compatibility layer for multiple asynchronous event loop implementations" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -58,6 +61,7 @@ trio = ["trio (>=0.23)"] name = "appnope" version = "0.1.3" description = "Disable App Nap on macOS >= 10.9" +category = "main" optional = false python-versions = "*" files = [ @@ -69,6 +73,7 @@ files = [ name = "argon2-cffi" version = "23.1.0" description = "Argon2 for Python" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -89,6 +94,7 @@ typing = ["mypy"] name = "argon2-cffi-bindings" version = "21.2.0" description = "Low-level CFFI bindings for Argon2" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -126,6 +132,7 @@ tests = ["pytest"] name = "arrow" version = "1.3.0" description = "Better dates & times for Python" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -139,12 +146,13 @@ types-python-dateutil = ">=2.8.10" [package.extras] doc = ["doc8", "sphinx (>=7.0.0)", "sphinx-autobuild", "sphinx-autodoc-typehints", "sphinx_rtd_theme (>=1.3.0)"] -test = ["dateparser (==1.*)", "pre-commit", "pytest", "pytest-cov", "pytest-mock", "pytz (==2021.1)", "simplejson (==3.*)"] +test = ["dateparser (>=1.0.0,<2.0.0)", "pre-commit", "pytest", "pytest-cov", "pytest-mock", "pytz (==2021.1)", "simplejson (>=3.0.0,<4.0.0)"] [[package]] name = "astroid" version = "2.15.8" description = "An abstract syntax tree for Python with inference support." +category = "dev" optional = false python-versions = ">=3.7.2" files = [ @@ -161,6 +169,7 @@ wrapt = {version = ">=1.11,<2", markers = "python_version < \"3.11\""} name = "asttokens" version = "2.4.1" description = "Annotate AST trees with source code positions" +category = "main" optional = false python-versions = "*" files = [ @@ -179,6 +188,7 @@ test = ["astroid (>=1,<2)", "astroid (>=2,<4)", "pytest"] name = "async-lru" version = "2.0.4" description = "Simple LRU cache for asyncio" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -193,6 +203,7 @@ typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.11\""} name = "attrs" version = "23.1.0" description = "Classes Without Boilerplate" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -211,6 +222,7 @@ tests-no-zope = ["cloudpickle", "hypothesis", "mypy (>=1.1.1)", "pympler", "pyte name = "babel" version = "2.13.1" description = "Internationalization utilities" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -236,6 +248,7 @@ files = [ name = "beautifulsoup4" version = "4.12.2" description = "Screen-scraping library" +category = "main" optional = false python-versions = ">=3.6.0" files = [ @@ -254,6 +267,7 @@ lxml = ["lxml"] name = "black" version = "23.11.0" description = "The uncompromising code formatter." 
+category = "dev" optional = false python-versions = ">=3.8" files = [ @@ -296,6 +310,7 @@ uvloop = ["uvloop (>=0.15.2)"] name = "bleach" version = "6.1.0" description = "An easy safelist-based HTML-sanitizing tool." +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -314,6 +329,7 @@ css = ["tinycss2 (>=1.1.0,<1.3)"] name = "cachetools" version = "5.3.2" description = "Extensible memoizing collections and decorators" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -325,6 +341,7 @@ files = [ name = "certifi" version = "2023.11.17" description = "Python package for providing Mozilla's CA Bundle." +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -336,6 +353,7 @@ files = [ name = "cffi" version = "1.16.0" description = "Foreign Function Interface for Python calling C code." +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -400,6 +418,7 @@ pycparser = "*" name = "charset-normalizer" version = "3.3.2" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." +category = "main" optional = false python-versions = ">=3.7.0" files = [ @@ -499,6 +518,7 @@ files = [ name = "click" version = "8.1.7" description = "Composable command line interface toolkit" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -513,6 +533,7 @@ colorama = {version = "*", markers = "platform_system == \"Windows\""} name = "click-log" version = "0.4.0" description = "Logging integration for Click" +category = "main" optional = false python-versions = "*" files = [ @@ -527,6 +548,7 @@ click = "*" name = "clickclick" version = "20.10.2" description = "Click utility functions" +category = "main" optional = false python-versions = "*" files = [ @@ -542,6 +564,7 @@ PyYAML = ">=3.11" name = "colorama" version = "0.4.6" description = "Cross-platform colored terminal text." +category = "main" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" files = [ @@ -553,6 +576,7 @@ files = [ name = "comm" version = "0.2.0" description = "Jupyter Python Comm implementation, for usage in ipykernel, xeus-python etc." +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -570,6 +594,7 @@ test = ["pytest"] name = "connexion" version = "2.14.2" description = "Connexion - API first applications with OpenAPI/Swagger and Flask" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -600,6 +625,7 @@ tests = ["MarkupSafe (>=0.23)", "aiohttp (>=2.3.10,<4)", "aiohttp-jinja2 (>=0.14 name = "coverage" version = "7.3.2" description = "Code coverage measurement for Python" +category = "dev" optional = false python-versions = ">=3.8" files = [ @@ -667,6 +693,7 @@ toml = ["tomli"] name = "cryptography" version = "41.0.7" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -708,10 +735,27 @@ ssh = ["bcrypt (>=3.1.5)"] test = ["pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"] test-randomorder = ["pytest-randomly"] +[[package]] +name = "dataclasses-json" +version = "0.6.1" +description = "Easily serialize dataclasses to and from JSON." 
+category = "main" +optional = false +python-versions = ">=3.7,<4.0" +files = [ + {file = "dataclasses_json-0.6.1-py3-none-any.whl", hash = "sha256:1bd8418a61fe3d588bb0079214d7fb71d44937da40742b787256fd53b26b6c80"}, + {file = "dataclasses_json-0.6.1.tar.gz", hash = "sha256:a53c220c35134ce08211a1057fd0e5bf76dc5331627c6b241cacbc570a89faae"}, +] + +[package.dependencies] +marshmallow = ">=3.18.0,<4.0.0" +typing-inspect = ">=0.4.0,<1" + [[package]] name = "dateparser" version = "1.2.0" description = "Date parsing library designed to parse dates from HTML pages" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -734,6 +778,7 @@ langdetect = ["langdetect"] name = "debugpy" version = "1.8.0" description = "An implementation of the Debug Adapter Protocol for Python" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -761,6 +806,7 @@ files = [ name = "decorator" version = "5.1.1" description = "Decorators for Humans" +category = "main" optional = false python-versions = ">=3.5" files = [ @@ -772,6 +818,7 @@ files = [ name = "defusedxml" version = "0.7.1" description = "XML bomb protection for Python stdlib modules" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" files = [ @@ -783,6 +830,7 @@ files = [ name = "deprecated" version = "1.2.14" description = "Python @deprecated decorator to deprecate old python classes, functions or methods." +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -800,6 +848,7 @@ dev = ["PyTest", "PyTest-Cov", "bump2version (<1)", "sphinx (<2)", "tox"] name = "deprecation" version = "2.1.0" description = "A library to handle automated deprecations" +category = "main" optional = false python-versions = "*" files = [ @@ -814,6 +863,7 @@ packaging = "*" name = "dill" version = "0.3.7" description = "serialize all of Python" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -828,6 +878,7 @@ graph = ["objgraph (>=1.7.2)"] name = "docutils" version = "0.20.1" description = "Docutils -- Python Documentation Utilities" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -839,6 +890,7 @@ files = [ name = "entrypoints" version = "0.4" description = "Discover and load entry points from installed packages." 
+category = "main" optional = false python-versions = ">=3.6" files = [ @@ -850,6 +902,7 @@ files = [ name = "et-xmlfile" version = "1.1.0" description = "An implementation of lxml.xmlfile for the standard library" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -861,6 +914,7 @@ files = [ name = "exceptiongroup" version = "1.2.0" description = "Backport of PEP 654 (exception groups)" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -889,6 +943,7 @@ testing = ["hatch", "pre-commit", "pytest", "tox"] name = "executing" version = "2.0.1" description = "Get the currently executing AST node of a frame, and other information" +category = "main" optional = false python-versions = ">=3.5" files = [ @@ -903,6 +958,7 @@ tests = ["asttokens (>=2.1.0)", "coverage", "coverage-enable-subprocess", "ipyth name = "fastjsonschema" version = "2.19.0" description = "Fastest Python implementation of JSON schema" +category = "main" optional = false python-versions = "*" files = [ @@ -917,6 +973,7 @@ devel = ["colorama", "json-spec", "jsonschema", "pylint", "pytest", "pytest-benc name = "flake8" version = "6.1.0" description = "the modular source code checker: pep8 pyflakes and co" +category = "dev" optional = false python-versions = ">=3.8.1" files = [ @@ -933,6 +990,7 @@ pyflakes = ">=3.1.0,<3.2.0" name = "flask" version = "2.1.3" description = "A simple framework for building complex web applications." +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -955,6 +1013,7 @@ dotenv = ["python-dotenv"] name = "flask-cors" version = "3.0.10" description = "A Flask extension adding a decorator for CORS support" +category = "main" optional = false python-versions = "*" files = [ @@ -970,6 +1029,7 @@ Six = "*" name = "fqdn" version = "1.5.1" description = "Validates fully-qualified domain names against RFC 1123, so that they are acceptable to modern bowsers" +category = "main" optional = false python-versions = ">=2.7, !=3.0, !=3.1, !=3.2, !=3.3, !=3.4, <4" files = [ @@ -981,6 +1041,7 @@ files = [ name = "google-api-core" version = "2.14.0" description = "Google API client core library" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1003,6 +1064,7 @@ grpcio-gcp = ["grpcio-gcp (>=0.2.2,<1.0.dev0)"] name = "google-api-python-client" version = "2.108.0" description = "Google API Client Library for Python" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1011,7 +1073,7 @@ files = [ ] [package.dependencies] -google-api-core = ">=1.31.5,<2.0.dev0 || >2.3.0,<3.0.0.dev0" +google-api-core = ">=1.31.5,<2.0.0 || >2.3.0,<3.0.0.dev0" google-auth = ">=1.19.0,<3.0.0.dev0" google-auth-httplib2 = ">=0.1.0" httplib2 = ">=0.15.0,<1.dev0" @@ -1021,6 +1083,7 @@ uritemplate = ">=3.0.1,<5" name = "google-auth" version = "2.23.4" description = "Google Authentication Library" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1044,6 +1107,7 @@ requests = ["requests (>=2.20.0,<3.0.0.dev0)"] name = "google-auth-httplib2" version = "0.1.1" description = "Google Authentication Library: httplib2 transport" +category = "main" optional = false python-versions = "*" files = [ @@ -1059,6 +1123,7 @@ httplib2 = ">=0.19.0" name = "google-auth-oauthlib" version = "0.8.0" description = "Google Authentication Library" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -1077,6 +1142,7 @@ tool = ["click (>=6.0.0)"] name = "googleapis-common-protos" version = "1.61.0" description = 
"Common protobufs used in Google APIs" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1094,6 +1160,7 @@ grpc = ["grpcio (>=1.44.0,<2.0.0.dev0)"] name = "graphviz" version = "0.20.1" description = "Simple Python interface for Graphviz" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1110,6 +1177,7 @@ test = ["coverage", "mock (>=4)", "pytest (>=7)", "pytest-cov", "pytest-mock (>= name = "great-expectations" version = "0.15.50" description = "Always know what to expect from your data." +category = "main" optional = false python-versions = "*" files = [ @@ -1184,6 +1252,7 @@ vertica = ["sqlalchemy (>=1.3.18,<2.0.0)", "sqlalchemy-vertica-python (>=0.5.10) name = "greenlet" version = "3.0.1" description = "Lightweight in-process concurrent programming" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1254,6 +1323,7 @@ test = ["objgraph", "psutil"] name = "httplib2" version = "0.22.0" description = "A comprehensive HTTP client library." +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -1268,6 +1338,7 @@ pyparsing = {version = ">=2.4.2,<3.0.0 || >3.0.0,<3.0.1 || >3.0.1,<3.0.2 || >3.0 name = "idna" version = "3.6" description = "Internationalized Domain Names in Applications (IDNA)" +category = "main" optional = false python-versions = ">=3.5" files = [ @@ -1279,6 +1350,7 @@ files = [ name = "imagesize" version = "1.4.1" description = "Getting image size from png/jpeg/jpeg2000/gif file" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -1290,6 +1362,7 @@ files = [ name = "importlib-metadata" version = "6.8.0" description = "Read metadata from Python packages" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1309,6 +1382,7 @@ testing = ["flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs name = "inflection" version = "0.5.1" description = "A port of Ruby on Rails inflector to Python" +category = "main" optional = false python-versions = ">=3.5" files = [ @@ -1320,6 +1394,7 @@ files = [ name = "iniconfig" version = "2.0.0" description = "brain-dead simple config-ini parsing" +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1331,6 +1406,7 @@ files = [ name = "interrogate" version = "1.5.0" description = "Interrogate a codebase for docstring coverage." 
+category = "main" optional = false python-versions = ">=3.6" files = [ @@ -1356,6 +1432,7 @@ tests = ["pytest", "pytest-cov", "pytest-mock"] name = "ipykernel" version = "6.27.1" description = "IPython Kernel for Jupyter" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1369,7 +1446,7 @@ comm = ">=0.1.1" debugpy = ">=1.6.5" ipython = ">=7.23.1" jupyter-client = ">=6.1.12" -jupyter-core = ">=4.12,<5.0.dev0 || >=5.1.dev0" +jupyter-core = ">=4.12,<5.0.0 || >=5.1.0" matplotlib-inline = ">=0.1" nest-asyncio = "*" packaging = "*" @@ -1389,6 +1466,7 @@ test = ["flaky", "ipyparallel", "pre-commit", "pytest (>=7.0)", "pytest-asyncio" name = "ipython" version = "8.18.1" description = "IPython: Productive Interactive Computing" +category = "main" optional = false python-versions = ">=3.9" files = [ @@ -1426,6 +1504,7 @@ test-extra = ["curio", "matplotlib (!=3.2.0)", "nbformat", "numpy (>=1.22)", "pa name = "ipywidgets" version = "8.1.1" description = "Jupyter interactive widgets" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1447,6 +1526,7 @@ test = ["ipykernel", "jsonschema", "pytest (>=3.6.0)", "pytest-cov", "pytz"] name = "isodate" version = "0.6.1" description = "An ISO 8601 date/time/duration parser and formatter" +category = "main" optional = false python-versions = "*" files = [ @@ -1461,6 +1541,7 @@ six = "*" name = "isoduration" version = "20.11.0" description = "Operations with ISO 8601 durations" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1475,6 +1556,7 @@ arrow = ">=0.15.0" name = "isort" version = "5.12.0" description = "A Python utility / library to sort Python imports." +category = "dev" optional = false python-versions = ">=3.8.0" files = [ @@ -1492,6 +1574,7 @@ requirements-deprecated-finder = ["pip-api", "pipreqs"] name = "itsdangerous" version = "2.1.2" description = "Safely pass data to untrusted environments and back." +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1503,6 +1586,7 @@ files = [ name = "jedi" version = "0.19.1" description = "An autocompletion tool for Python that can be used for text editors." +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -1522,6 +1606,7 @@ testing = ["Django", "attrs", "colorama", "docopt", "pytest (<7.0.0)"] name = "jeepney" version = "0.8.0" description = "Low-level, pure Python DBus protocol wrapper." +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1537,6 +1622,7 @@ trio = ["async_generator", "trio"] name = "jinja2" version = "3.1.2" description = "A very fast and expressive template engine." +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1554,6 +1640,7 @@ i18n = ["Babel (>=2.7)"] name = "json5" version = "0.9.14" description = "A Python implementation of the JSON5 data format." 
+category = "main" optional = false python-versions = "*" files = [ @@ -1568,6 +1655,7 @@ dev = ["hypothesis"] name = "jsonpatch" version = "1.33" description = "Apply JSON-Patches (RFC 6902)" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*" files = [ @@ -1582,6 +1670,7 @@ jsonpointer = ">=1.9" name = "jsonpointer" version = "2.4" description = "Identify specific nodes in a JSON document (RFC 6901)" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*" files = [ @@ -1593,6 +1682,7 @@ files = [ name = "jsonschema" version = "4.20.0" description = "An implementation of JSON Schema validation for Python" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1622,6 +1712,7 @@ format-nongpl = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339- name = "jsonschema-specifications" version = "2023.11.1" description = "The JSON Schema meta-schemas and vocabularies, exposed as a Registry" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1636,6 +1727,7 @@ referencing = ">=0.31.0" name = "jupyter-client" version = "8.6.0" description = "Jupyter protocol implementation and client libraries" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1645,7 +1737,7 @@ files = [ [package.dependencies] importlib-metadata = {version = ">=4.8.3", markers = "python_version < \"3.10\""} -jupyter-core = ">=4.12,<5.0.dev0 || >=5.1.dev0" +jupyter-core = ">=4.12,<5.0.0 || >=5.1.0" python-dateutil = ">=2.8.2" pyzmq = ">=23.0" tornado = ">=6.2" @@ -1659,6 +1751,7 @@ test = ["coverage", "ipykernel (>=6.14)", "mypy", "paramiko", "pre-commit", "pyt name = "jupyter-core" version = "5.5.0" description = "Jupyter core package. A base package on which Jupyter projects rely." +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1679,6 +1772,7 @@ test = ["ipykernel", "pre-commit", "pytest", "pytest-cov", "pytest-timeout"] name = "jupyter-events" version = "0.9.0" description = "Jupyter Event System library" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1704,6 +1798,7 @@ test = ["click", "pre-commit", "pytest (>=7.0)", "pytest-asyncio (>=0.19.0)", "p name = "jupyter-lsp" version = "2.2.1" description = "Multi-Language Server WebSocket proxy for Jupyter Notebook/Lab server" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1719,6 +1814,7 @@ jupyter-server = ">=1.1.2" name = "jupyter-server" version = "2.11.1" description = "The backend—i.e. core services, APIs, and REST endpoints—to Jupyter web applications." +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1755,6 +1851,7 @@ test = ["flaky", "ipykernel", "pre-commit", "pytest (>=7.0)", "pytest-console-sc name = "jupyter-server-terminals" version = "0.4.4" description = "A Jupyter Server Extension Providing Terminals." 
+category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1774,6 +1871,7 @@ test = ["coverage", "jupyter-server (>=2.0.0)", "pytest (>=7.0)", "pytest-cov", name = "jupyterlab" version = "4.0.9" description = "JupyterLab computational environment" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1806,6 +1904,7 @@ test = ["coverage", "pytest (>=7.0)", "pytest-check-links (>=0.7)", "pytest-cons name = "jupyterlab-pygments" version = "0.3.0" description = "Pygments theme using JupyterLab CSS variables" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1817,6 +1916,7 @@ files = [ name = "jupyterlab-server" version = "2.25.2" description = "A set of server components for JupyterLab and JupyterLab like applications." +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -1843,6 +1943,7 @@ test = ["hatch", "ipykernel", "openapi-core (>=0.18.0,<0.19.0)", "openapi-spec-v name = "jupyterlab-widgets" version = "3.0.9" description = "Jupyter interactive widgets for JupyterLab" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1854,6 +1955,7 @@ files = [ name = "keyring" version = "23.4.1" description = "Store and access your passwords safely." +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -1875,6 +1977,7 @@ testing = ["pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", name = "keyrings-alt" version = "3.1" description = "Alternate keyring implementations" +category = "main" optional = false python-versions = ">=2.7" files = [ @@ -1893,6 +1996,7 @@ testing = ["backports.unittest-mock", "collective.checkdocs", "fs (>=0.5,<2)", " name = "lazy-object-proxy" version = "1.9.0" description = "A fast and thorough lazy object proxy." +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -1938,6 +2042,7 @@ files = [ name = "makefun" version = "1.15.2" description = "Small library to dynamically create python functions." +category = "main" optional = false python-versions = "*" files = [ @@ -1949,6 +2054,7 @@ files = [ name = "markupsafe" version = "2.1.0" description = "Safely add untrusted strings to HTML/XML markup." +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1998,6 +2104,7 @@ files = [ name = "marshmallow" version = "3.20.1" description = "A lightweight library for converting complex datatypes to and from native Python datatypes." +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -2018,6 +2125,7 @@ tests = ["pytest", "pytz", "simplejson"] name = "matplotlib-inline" version = "0.1.6" description = "Inline Matplotlib backend for Jupyter" +category = "main" optional = false python-versions = ">=3.5" files = [ @@ -2032,6 +2140,7 @@ traitlets = "*" name = "mccabe" version = "0.7.0" description = "McCabe checker, plugin for flake8" +category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -2043,6 +2152,7 @@ files = [ name = "mistune" version = "3.0.2" description = "A sane and fast Markdown parser with useful plugins and renderers" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2054,6 +2164,7 @@ files = [ name = "mypy" version = "1.7.1" description = "Optional static typing for Python" +category = "dev" optional = false python-versions = ">=3.8" files = [ @@ -2101,6 +2212,7 @@ reports = ["lxml"] name = "mypy-extensions" version = "1.0.0" description = "Type system extensions for programs checked with the mypy type checker." 
+category = "main" optional = false python-versions = ">=3.5" files = [ @@ -2112,6 +2224,7 @@ files = [ name = "nbclient" version = "0.9.0" description = "A client library for executing notebooks. Formerly nbconvert's ExecutePreprocessor." +category = "main" optional = false python-versions = ">=3.8.0" files = [ @@ -2121,7 +2234,7 @@ files = [ [package.dependencies] jupyter-client = ">=6.1.12" -jupyter-core = ">=4.12,<5.0.dev0 || >=5.1.dev0" +jupyter-core = ">=4.12,<5.0.0 || >=5.1.0" nbformat = ">=5.1" traitlets = ">=5.4" @@ -2134,6 +2247,7 @@ test = ["flaky", "ipykernel (>=6.19.3)", "ipython", "ipywidgets", "nbconvert (>= name = "nbconvert" version = "7.11.0" description = "Converting Jupyter Notebooks" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -2172,6 +2286,7 @@ webpdf = ["playwright"] name = "nbformat" version = "5.9.2" description = "The Jupyter Notebook format" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -2193,6 +2308,7 @@ test = ["pep440", "pre-commit", "pytest", "testpath"] name = "nest-asyncio" version = "1.5.8" description = "Patch asyncio to allow nested event loops" +category = "main" optional = false python-versions = ">=3.5" files = [ @@ -2204,6 +2320,7 @@ files = [ name = "networkx" version = "2.8.8" description = "Python package for creating and manipulating graphs and networks" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -2222,6 +2339,7 @@ test = ["codecov (>=2.1)", "pytest (>=7.2)", "pytest-cov (>=4.0)"] name = "notebook" version = "7.0.6" description = "Jupyter Notebook - A web-based notebook environment for interactive computing" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -2245,6 +2363,7 @@ test = ["importlib-resources (>=5.0)", "ipykernel", "jupyter-server[test] (>=2.4 name = "notebook-shim" version = "0.2.3" description = "A shim layer for notebook traits and config" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2262,6 +2381,7 @@ test = ["pytest", "pytest-console-scripts", "pytest-jupyter", "pytest-tornasync" name = "numpy" version = "1.26.2" description = "Fundamental package for array computing in Python" +category = "main" optional = false python-versions = ">=3.9" files = [ @@ -2307,6 +2427,7 @@ files = [ name = "oauth2client" version = "4.1.3" description = "OAuth 2.0 client library" +category = "main" optional = false python-versions = "*" files = [ @@ -2325,6 +2446,7 @@ six = ">=1.6.1" name = "oauthlib" version = "3.2.2" description = "A generic, spec-compliant, thorough implementation of the OAuth request-signing logic" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -2341,6 +2463,7 @@ signedtoken = ["cryptography (>=3.0.0)", "pyjwt (>=2.0.0,<3)"] name = "openpyxl" version = "3.1.2" description = "A Python library to read/write Excel 2010 xlsx/xlsm files" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -2450,6 +2573,7 @@ files = [ name = "overrides" version = "7.4.0" description = "A decorator to automatically detect mismatch when overriding a method." 
+category = "main" optional = false python-versions = ">=3.6" files = [ @@ -2461,6 +2585,7 @@ files = [ name = "packaging" version = "23.2" description = "Core utilities for Python packages" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2472,6 +2597,7 @@ files = [ name = "pandarallel" version = "1.6.5" description = "An easy to use library to speed up computation (by parallelizing on multi CPUs) with pandas." +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2491,6 +2617,7 @@ doc = ["mkdocs-material"] name = "pandas" version = "1.5.3" description = "Powerful data structures for data analysis, time series, and statistics" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -2538,6 +2665,7 @@ test = ["hypothesis (>=5.5.3)", "pytest (>=6.0)", "pytest-xdist (>=1.31)"] name = "pandocfilters" version = "1.5.0" description = "Utilities for writing pandoc filters in python" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -2549,6 +2677,7 @@ files = [ name = "parso" version = "0.8.3" description = "A Python Parser" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -2564,6 +2693,7 @@ testing = ["docopt", "pytest (<6.0.0)"] name = "pathspec" version = "0.11.2" description = "Utility library for gitignore style pattern matching of file paths." +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -2575,6 +2705,7 @@ files = [ name = "pdoc" version = "12.3.1" description = "API Documentation for Python Projects" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2594,6 +2725,7 @@ dev = ["black", "hypothesis", "mypy", "pytest", "pytest-cov", "pytest-timeout", name = "pexpect" version = "4.9.0" description = "Pexpect allows easy control of interactive console applications." +category = "main" optional = false python-versions = "*" files = [ @@ -2608,6 +2740,7 @@ ptyprocess = ">=0.5" name = "platformdirs" version = "4.0.0" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2623,6 +2756,7 @@ test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.4)", "pytest-co name = "pluggy" version = "1.3.0" description = "plugin and hook calling mechanisms for python" +category = "dev" optional = false python-versions = ">=3.8" files = [ @@ -2638,6 +2772,7 @@ testing = ["pytest", "pytest-benchmark"] name = "prometheus-client" version = "0.19.0" description = "Python client for the Prometheus monitoring system." +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -2652,6 +2787,7 @@ twisted = ["twisted"] name = "prompt-toolkit" version = "3.0.41" description = "Library for building powerful interactive command lines in Python" +category = "main" optional = false python-versions = ">=3.7.0" files = [ @@ -2666,6 +2802,7 @@ wcwidth = "*" name = "protobuf" version = "4.25.1" description = "" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -2686,6 +2823,7 @@ files = [ name = "psutil" version = "5.9.6" description = "Cross-platform lib for process and system monitoring in Python." 
+category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" files = [ @@ -2714,6 +2852,7 @@ test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] name = "ptyprocess" version = "0.7.0" description = "Run a subprocess in a pseudo terminal" +category = "main" optional = false python-versions = "*" files = [ @@ -2725,6 +2864,7 @@ files = [ name = "pure-eval" version = "0.2.2" description = "Safely evaluate AST nodes without side effects" +category = "main" optional = false python-versions = "*" files = [ @@ -2739,6 +2879,7 @@ tests = ["pytest"] name = "py" version = "1.11.0" description = "library with cross-python path, ini-parsing, io, code, log facilities" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" files = [ @@ -2750,6 +2891,7 @@ files = [ name = "pyasn1" version = "0.5.1" description = "Pure-Python implementation of ASN.1 types and DER/BER/CER codecs (X.208)" +category = "main" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" files = [ @@ -2761,6 +2903,7 @@ files = [ name = "pyasn1-modules" version = "0.3.0" description = "A collection of ASN.1-based protocols modules" +category = "main" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" files = [ @@ -2775,6 +2918,7 @@ pyasn1 = ">=0.4.6,<0.6.0" name = "pycodestyle" version = "2.11.1" description = "Python style guide checker" +category = "dev" optional = false python-versions = ">=3.8" files = [ @@ -2786,6 +2930,7 @@ files = [ name = "pycparser" version = "2.21" description = "C parser in Python" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -2797,6 +2942,7 @@ files = [ name = "pydantic" version = "1.10.13" description = "Data validation and settings management using python type hints" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2849,6 +2995,7 @@ email = ["email-validator (>=1.0.3)"] name = "pyflakes" version = "3.1.0" description = "passive checker of Python programs" +category = "dev" optional = false python-versions = ">=3.8" files = [ @@ -2860,6 +3007,7 @@ files = [ name = "pygments" version = "2.17.2" description = "Pygments is a syntax highlighting package written in Python." 
+category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2875,6 +3023,7 @@ windows-terminal = ["colorama (>=0.4.6)"] name = "pygsheets" version = "2.0.6" description = "Google Spreadsheets Python API v4" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -2893,6 +3042,7 @@ pandas = ["pandas (>=0.14.0)"] name = "pylint" version = "2.17.7" description = "python code static checker" +category = "dev" optional = false python-versions = ">=3.7.2" files = [ @@ -2919,6 +3069,7 @@ testutils = ["gitpython (>3)"] name = "pyopenssl" version = "23.3.0" description = "Python wrapper module around the OpenSSL library" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2937,6 +3088,7 @@ test = ["flaky", "pretend", "pytest (>=3.0.1)"] name = "pyparsing" version = "3.1.1" description = "pyparsing module - Classes and methods to define and execute parsing grammars" +category = "main" optional = false python-versions = ">=3.6.8" files = [ @@ -2951,6 +3103,7 @@ diagrams = ["jinja2", "railroad-diagrams"] name = "pytest" version = "7.4.3" description = "pytest: simple powerful testing with Python" +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -2973,6 +3126,7 @@ testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "no name = "pytest-cov" version = "4.1.0" description = "Pytest plugin for measuring coverage." +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -2991,6 +3145,7 @@ testing = ["fields", "hunter", "process-tests", "pytest-xdist", "six", "virtuale name = "pytest-mock" version = "3.12.0" description = "Thin-wrapper around the mock package for easier use with pytest" +category = "dev" optional = false python-versions = ">=3.8" files = [ @@ -3043,6 +3198,7 @@ testing = ["filelock"] name = "python-dateutil" version = "2.8.2" description = "Extensions to the standard Python datetime module" +category = "main" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" files = [ @@ -3057,6 +3213,7 @@ six = ">=1.5" name = "python-dotenv" version = "0.21.1" description = "Read key-value pairs from a .env file and set them as environment variables" +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -3071,6 +3228,7 @@ cli = ["click (>=5.0)"] name = "python-json-logger" version = "2.0.7" description = "A python library adding a json log formatter" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -3082,6 +3240,7 @@ files = [ name = "pytz" version = "2023.3.post1" description = "World timezone definitions, modern and historical" +category = "main" optional = false python-versions = "*" files = [ @@ -3093,6 +3252,7 @@ files = [ name = "pywin32" version = "306" description = "Python for Window Extensions" +category = "main" optional = false python-versions = "*" files = [ @@ -3116,6 +3276,7 @@ files = [ name = "pywin32-ctypes" version = "0.2.2" description = "A (partial) reimplementation of pywin32 using ctypes/cffi" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -3127,6 +3288,7 @@ files = [ name = "pywinpty" version = "2.0.12" description = "Pseudo terminal support for Windows from Python." 
+category = "main" optional = false python-versions = ">=3.8" files = [ @@ -3142,6 +3304,7 @@ files = [ name = "pyyaml" version = "6.0.1" description = "YAML parser and emitter for Python" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -3201,6 +3364,7 @@ files = [ name = "pyzmq" version = "25.1.1" description = "Python bindings for 0MQ" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -3306,6 +3470,7 @@ cffi = {version = "*", markers = "implementation_name == \"pypy\""} name = "rdflib" version = "6.3.2" description = "RDFLib is a Python library for working with RDF, a simple yet powerful language for representing information." +category = "main" optional = false python-versions = ">=3.7,<4.0" files = [ @@ -3327,6 +3492,7 @@ networkx = ["networkx (>=2.0.0,<3.0.0)"] name = "referencing" version = "0.31.0" description = "JSON Referencing + Python" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -3342,6 +3508,7 @@ rpds-py = ">=0.7.0" name = "regex" version = "2023.10.3" description = "Alternative regular expression module, to replace re." +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -3439,6 +3606,7 @@ files = [ name = "requests" version = "2.31.0" description = "Python HTTP for Humans." +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -3460,6 +3628,7 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] name = "requests-oauthlib" version = "1.3.1" description = "OAuthlib authentication support for Requests." +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -3478,6 +3647,7 @@ rsa = ["oauthlib[signedtoken] (>=3.0.0)"] name = "rfc3339-validator" version = "0.1.4" description = "A pure python RFC3339 validator" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" files = [ @@ -3492,6 +3662,7 @@ six = "*" name = "rfc3986-validator" version = "0.1.1" description = "Pure python rfc3986 validator" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" files = [ @@ -3503,6 +3674,7 @@ files = [ name = "rpds-py" version = "0.13.1" description = "Python bindings to Rust's persistent data structures (rpds)" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -3611,6 +3783,7 @@ files = [ name = "rsa" version = "4.9" description = "Pure-Python RSA implementation" +category = "main" optional = false python-versions = ">=3.6,<4" files = [ @@ -3625,6 +3798,7 @@ pyasn1 = ">=0.1.3" name = "ruamel-yaml" version = "0.17.17" description = "ruamel.yaml is a YAML parser/emitter that supports roundtrip preservation of comments, seq/map flow style, and map key order" +category = "main" optional = false python-versions = ">=3" files = [ @@ -3643,6 +3817,7 @@ jinja2 = ["ruamel.yaml.jinja2 (>=0.2)"] name = "ruamel-yaml-clib" version = "0.2.8" description = "C version of reader, parser and emitter for ruamel.yaml derived from libyaml" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -3675,24 +3850,18 @@ files = [ {file = "ruamel.yaml.clib-0.2.8-cp37-cp37m-macosx_12_0_arm64.whl", hash = "sha256:f481f16baec5290e45aebdc2a5168ebc6d35189ae6fea7a58787613a25f6e875"}, {file = "ruamel.yaml.clib-0.2.8-cp37-cp37m-manylinux_2_24_aarch64.whl", hash = "sha256:77159f5d5b5c14f7c34073862a6b7d34944075d9f93e681638f6d753606c6ce6"}, {file = 
"ruamel.yaml.clib-0.2.8-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:7f67a1ee819dc4562d444bbafb135832b0b909f81cc90f7aa00260968c9ca1b3"}, - {file = "ruamel.yaml.clib-0.2.8-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:4ecbf9c3e19f9562c7fdd462e8d18dd902a47ca046a2e64dba80699f0b6c09b7"}, - {file = "ruamel.yaml.clib-0.2.8-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:87ea5ff66d8064301a154b3933ae406b0863402a799b16e4a1d24d9fbbcbe0d3"}, {file = "ruamel.yaml.clib-0.2.8-cp37-cp37m-win32.whl", hash = "sha256:75e1ed13e1f9de23c5607fe6bd1aeaae21e523b32d83bb33918245361e9cc51b"}, {file = "ruamel.yaml.clib-0.2.8-cp37-cp37m-win_amd64.whl", hash = "sha256:3f215c5daf6a9d7bbed4a0a4f760f3113b10e82ff4c5c44bec20a68c8014f675"}, {file = "ruamel.yaml.clib-0.2.8-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1b617618914cb00bf5c34d4357c37aa15183fa229b24767259657746c9077615"}, {file = "ruamel.yaml.clib-0.2.8-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:a6a9ffd280b71ad062eae53ac1659ad86a17f59a0fdc7699fd9be40525153337"}, {file = "ruamel.yaml.clib-0.2.8-cp38-cp38-manylinux_2_24_aarch64.whl", hash = "sha256:305889baa4043a09e5b76f8e2a51d4ffba44259f6b4c72dec8ca56207d9c6fe1"}, {file = "ruamel.yaml.clib-0.2.8-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:700e4ebb569e59e16a976857c8798aee258dceac7c7d6b50cab63e080058df91"}, - {file = "ruamel.yaml.clib-0.2.8-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:e2b4c44b60eadec492926a7270abb100ef9f72798e18743939bdbf037aab8c28"}, - {file = "ruamel.yaml.clib-0.2.8-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:e79e5db08739731b0ce4850bed599235d601701d5694c36570a99a0c5ca41a9d"}, {file = "ruamel.yaml.clib-0.2.8-cp38-cp38-win32.whl", hash = "sha256:955eae71ac26c1ab35924203fda6220f84dce57d6d7884f189743e2abe3a9fbe"}, {file = "ruamel.yaml.clib-0.2.8-cp38-cp38-win_amd64.whl", hash = "sha256:56f4252222c067b4ce51ae12cbac231bce32aee1d33fbfc9d17e5b8d6966c312"}, {file = "ruamel.yaml.clib-0.2.8-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:03d1162b6d1df1caa3a4bd27aa51ce17c9afc2046c31b0ad60a0a96ec22f8001"}, {file = "ruamel.yaml.clib-0.2.8-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:bba64af9fa9cebe325a62fa398760f5c7206b215201b0ec825005f1b18b9bccf"}, {file = "ruamel.yaml.clib-0.2.8-cp39-cp39-manylinux_2_24_aarch64.whl", hash = "sha256:a1a45e0bb052edf6a1d3a93baef85319733a888363938e1fc9924cb00c8df24c"}, {file = "ruamel.yaml.clib-0.2.8-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:da09ad1c359a728e112d60116f626cc9f29730ff3e0e7db72b9a2dbc2e4beed5"}, - {file = "ruamel.yaml.clib-0.2.8-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:184565012b60405d93838167f425713180b949e9d8dd0bbc7b49f074407c5a8b"}, - {file = "ruamel.yaml.clib-0.2.8-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a75879bacf2c987c003368cf14bed0ffe99e8e85acfa6c0bfffc21a090f16880"}, {file = "ruamel.yaml.clib-0.2.8-cp39-cp39-win32.whl", hash = "sha256:84b554931e932c46f94ab306913ad7e11bba988104c5cff26d90d03f68258cd5"}, {file = "ruamel.yaml.clib-0.2.8-cp39-cp39-win_amd64.whl", hash = "sha256:25ac8c08322002b06fa1d49d1646181f0b2c72f5cbc15a85e80b4c30a544bb15"}, {file = "ruamel.yaml.clib-0.2.8.tar.gz", hash = "sha256:beb2e0404003de9a4cab9753a8805a8fe9320ee6673136ed7f04255fe60bb512"}, @@ -3702,6 +3871,7 @@ files = [ name = "schematic-db" version = "0.0.dev33" description = "" +category = "main" optional = false python-versions = ">=3.9,<4.0" files = [ @@ -3732,6 +3902,7 @@ synapse = ["synapseclient (>=3.0.0,<4.0.0)"] name = "scipy" version = 
"1.11.4" description = "Fundamental algorithms for scientific computing in Python" +category = "main" optional = false python-versions = ">=3.9" files = [ @@ -3774,6 +3945,7 @@ test = ["asv", "gmpy2", "mpmath", "pooch", "pytest", "pytest-cov", "pytest-timeo name = "secretstorage" version = "3.3.3" description = "Python bindings to FreeDesktop.org Secret Service API" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -3789,6 +3961,7 @@ jeepney = ">=0.6" name = "send2trash" version = "1.8.2" description = "Send file to trash natively under Mac OS X, Windows and Linux" +category = "main" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" files = [ @@ -3805,6 +3978,7 @@ win32 = ["pywin32"] name = "setuptools" version = "66.1.1" description = "Easily download, build, install, upgrade, and uninstall Python packages" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -3821,6 +3995,7 @@ testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs ( name = "six" version = "1.16.0" description = "Python 2 and 3 compatibility utilities" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" files = [ @@ -3832,6 +4007,7 @@ files = [ name = "sniffio" version = "1.3.0" description = "Sniff out which async library your code is running under" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -3843,6 +4019,7 @@ files = [ name = "snowballstemmer" version = "2.2.0" description = "This package provides 29 stemmers for 28 languages generated from Snowball algorithms." +category = "main" optional = false python-versions = "*" files = [ @@ -3854,6 +4031,7 @@ files = [ name = "soupsieve" version = "2.5" description = "A modern CSS selector implementation for Beautiful Soup." 
+category = "main" optional = false python-versions = ">=3.8" files = [ @@ -3865,6 +4043,7 @@ files = [ name = "sphinx" version = "7.2.6" description = "Python documentation generator" +category = "main" optional = false python-versions = ">=3.9" files = [ @@ -3900,6 +4079,7 @@ test = ["cython (>=3.0)", "filelock", "html5lib", "pytest (>=4.6)", "setuptools name = "sphinx-click" version = "4.4.0" description = "Sphinx extension that automatically documents click applications" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -3916,6 +4096,7 @@ sphinx = ">=2.0" name = "sphinxcontrib-applehelp" version = "1.0.7" description = "sphinxcontrib-applehelp is a Sphinx extension which outputs Apple help books" +category = "main" optional = false python-versions = ">=3.9" files = [ @@ -3934,6 +4115,7 @@ test = ["pytest"] name = "sphinxcontrib-devhelp" version = "1.0.5" description = "sphinxcontrib-devhelp is a sphinx extension which outputs Devhelp documents" +category = "main" optional = false python-versions = ">=3.9" files = [ @@ -3952,6 +4134,7 @@ test = ["pytest"] name = "sphinxcontrib-htmlhelp" version = "2.0.4" description = "sphinxcontrib-htmlhelp is a sphinx extension which renders HTML help files" +category = "main" optional = false python-versions = ">=3.9" files = [ @@ -3970,6 +4153,7 @@ test = ["html5lib", "pytest"] name = "sphinxcontrib-jsmath" version = "1.0.1" description = "A sphinx extension which renders display math in HTML via JavaScript" +category = "main" optional = false python-versions = ">=3.5" files = [ @@ -3984,6 +4168,7 @@ test = ["flake8", "mypy", "pytest"] name = "sphinxcontrib-qthelp" version = "1.0.6" description = "sphinxcontrib-qthelp is a sphinx extension which outputs QtHelp documents" +category = "main" optional = false python-versions = ">=3.9" files = [ @@ -4002,6 +4187,7 @@ test = ["pytest"] name = "sphinxcontrib-serializinghtml" version = "1.1.9" description = "sphinxcontrib-serializinghtml is a sphinx extension which outputs \"serialized\" HTML files (json and pickle)" +category = "main" optional = false python-versions = ">=3.9" files = [ @@ -4020,6 +4206,7 @@ test = ["pytest"] name = "sqlalchemy" version = "2.0.23" description = "Database Abstraction Library" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -4107,6 +4294,7 @@ sqlcipher = ["sqlcipher3-binary"] name = "sqlalchemy-utils" version = "0.41.1" description = "Various utility functions for SQLAlchemy." +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -4135,6 +4323,7 @@ url = ["furl (>=0.4.1)"] name = "stack-data" version = "0.6.3" description = "Extract data from python stack frames and tracebacks for informative displays" +category = "main" optional = false python-versions = "*" files = [ @@ -4154,6 +4343,7 @@ tests = ["cython", "littleutils", "pygments", "pytest", "typeguard"] name = "swagger-ui-bundle" version = "0.0.9" description = "swagger_ui_bundle - swagger-ui files in a pip package" +category = "main" optional = false python-versions = "*" files = [ @@ -4168,6 +4358,7 @@ Jinja2 = ">=2.0" name = "synapseclient" version = "3.2.0" description = "A client for Synapse, a collaborative, open-source research platform that allows teams to share data, track analyses, and collaborate." 
+category = "main" optional = false python-versions = ">=3.8" files = [ @@ -4197,6 +4388,7 @@ tests = ["flake8 (>=3.7.0,<4.0)", "func-timeout (>=4.3,<5.0)", "pytest (>=6.0.0, name = "tabulate" version = "0.9.0" description = "Pretty-print tabular data" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -4211,6 +4403,7 @@ widechars = ["wcwidth"] name = "tenacity" version = "8.2.3" description = "Retry code until it succeeds" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -4225,6 +4418,7 @@ doc = ["reno", "sphinx", "tornado (>=4.5)"] name = "terminado" version = "0.18.0" description = "Tornado websocket backend for the Xterm.js Javascript terminal emulator library." +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -4246,6 +4440,7 @@ typing = ["mypy (>=1.6,<2.0)", "traitlets (>=5.11.1)"] name = "tinycss2" version = "1.2.1" description = "A tiny CSS parser" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -4264,6 +4459,7 @@ test = ["flake8", "isort", "pytest"] name = "toml" version = "0.10.2" description = "Python Library for Tom's Obvious, Minimal Language" +category = "main" optional = false python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" files = [ @@ -4275,6 +4471,7 @@ files = [ name = "tomli" version = "2.0.1" description = "A lil' TOML parser" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -4286,6 +4483,7 @@ files = [ name = "tomlkit" version = "0.12.3" description = "Style preserving TOML library" +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -4297,6 +4495,7 @@ files = [ name = "toolz" version = "0.12.0" description = "List processing tools and functional utilities" +category = "main" optional = false python-versions = ">=3.5" files = [ @@ -4308,6 +4507,7 @@ files = [ name = "tornado" version = "6.3.3" description = "Tornado is a Python web framework and asynchronous networking library, originally developed at FriendFeed." +category = "main" optional = false python-versions = ">= 3.8" files = [ @@ -4328,6 +4528,7 @@ files = [ name = "tqdm" version = "4.66.1" description = "Fast, Extensible Progress Meter" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -4348,6 +4549,7 @@ telegram = ["requests"] name = "traitlets" version = "5.14.0" description = "Traitlets Python configuration system" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -4363,6 +4565,7 @@ test = ["argcomplete (>=3.0.3)", "mypy (>=1.7.0)", "pre-commit", "pytest (>=7.0, name = "types-python-dateutil" version = "2.8.19.14" description = "Typing stubs for python-dateutil" +category = "main" optional = false python-versions = "*" files = [ @@ -4374,6 +4577,7 @@ files = [ name = "typing-extensions" version = "4.5.0" description = "Backported and Experimental Type Hints for Python 3.7+" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -4381,10 +4585,27 @@ files = [ {file = "typing_extensions-4.5.0.tar.gz", hash = "sha256:5cb5f4a79139d699607b3ef622a1dedafa84e115ab0024e0d9c044a9479ca7cb"}, ] +[[package]] +name = "typing-inspect" +version = "0.9.0" +description = "Runtime inspection utilities for typing module." 
+category = "main" +optional = false +python-versions = "*" +files = [ + {file = "typing_inspect-0.9.0-py3-none-any.whl", hash = "sha256:9ee6fc59062311ef8547596ab6b955e1b8aa46242d854bfc78f4f6b0eff35f9f"}, + {file = "typing_inspect-0.9.0.tar.gz", hash = "sha256:b23fc42ff6f6ef6954e4852c1fb512cdd18dbea03134f91f856a95ccc9461f78"}, +] + +[package.dependencies] +mypy-extensions = ">=0.3.0" +typing-extensions = ">=3.7.4" + [[package]] name = "tzdata" version = "2023.3" description = "Provider of IANA time zone data" +category = "main" optional = false python-versions = ">=2" files = [ @@ -4396,6 +4617,7 @@ files = [ name = "tzlocal" version = "5.2" description = "tzinfo object for the local timezone" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -4413,6 +4635,7 @@ devenv = ["check-manifest", "pytest (>=4.3)", "pytest-cov", "pytest-mock (>=3.3) name = "uri-template" version = "1.3.0" description = "RFC 6570 URI Template Processor" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -4427,6 +4650,7 @@ dev = ["flake8", "flake8-annotations", "flake8-bandit", "flake8-bugbear", "flake name = "uritemplate" version = "4.1.1" description = "Implementation of RFC 6570 URI Templates" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -4438,6 +4662,7 @@ files = [ name = "urllib3" version = "1.26.18" description = "HTTP library with thread-safe connection pooling, file post, and more." +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" files = [ @@ -4454,6 +4679,7 @@ socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] name = "uwsgi" version = "2.0.23" description = "The uWSGI server" +category = "dev" optional = false python-versions = "*" files = [ @@ -4464,6 +4690,7 @@ files = [ name = "validators" version = "0.20.0" description = "Python Data Validation for Humans™." +category = "main" optional = false python-versions = ">=3.4" files = [ @@ -4480,6 +4707,7 @@ test = ["flake8 (>=2.4.0)", "isort (>=4.2.2)", "pytest (>=2.2.3)"] name = "wcwidth" version = "0.2.12" description = "Measures the displayed width of unicode strings in a terminal" +category = "main" optional = false python-versions = "*" files = [ @@ -4491,6 +4719,7 @@ files = [ name = "webcolors" version = "1.13" description = "A library for working with the color formats defined by HTML and CSS." +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -4506,6 +4735,7 @@ tests = ["pytest", "pytest-cov"] name = "webencodings" version = "0.5.1" description = "Character encoding aliases for legacy web content" +category = "main" optional = false python-versions = "*" files = [ @@ -4517,6 +4747,7 @@ files = [ name = "websocket-client" version = "1.6.4" description = "WebSocket client for Python with low level API options" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -4533,6 +4764,7 @@ test = ["websockets"] name = "werkzeug" version = "2.1.2" description = "The comprehensive WSGI web application library." +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -4547,6 +4779,7 @@ watchdog = ["watchdog"] name = "widgetsnbextension" version = "4.0.9" description = "Jupyter interactive widgets for Jupyter Notebook" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -4558,6 +4791,7 @@ files = [ name = "wrapt" version = "1.16.0" description = "Module for decorators, wrappers and monkey patching." 
+category = "main" optional = false python-versions = ">=3.6" files = [ @@ -4637,6 +4871,7 @@ files = [ name = "zipp" version = "3.17.0" description = "Backport of pathlib-compatible object wrapper for zip files" +category = "main" optional = false python-versions = ">=3.8" files = [ diff --git a/pyproject.toml b/pyproject.toml index c7c415349..205030f32 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -73,6 +73,7 @@ pandarallel = "^1.6.4" schematic-db = {version = "0.0.dev33", extras = ["synapse"]} pyopenssl = "^23.0.0" typing-extensions = "<4.6.0" +dataclasses-json = "^0.6.1" [tool.poetry.group.dev.dependencies] pytest = "^7.0.0" diff --git a/schematic/help.py b/schematic/help.py index c738df1bc..c243a10ab 100644 --- a/schematic/help.py +++ b/schematic/help.py @@ -166,9 +166,6 @@ "short_help": ( "Convert specification from CSV data model to JSON-LD data model." ), - "base_schema": ( - "Path to base data model. BioThings data model is loaded by default." - ), "output_jsonld": ( "Path to where the generated JSON-LD file needs to be outputted." ), diff --git a/schematic/manifest/commands.py b/schematic/manifest/commands.py index a75aa7216..1f916b05c 100644 --- a/schematic/manifest/commands.py +++ b/schematic/manifest/commands.py @@ -6,11 +6,14 @@ import click import click_log +from schematic.schemas.data_model_parser import DataModelParser +from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer from schematic.manifest.generator import ManifestGenerator + from schematic.utils.cli_utils import log_value_from_config, query_dict, parse_synIDs -from schematic.help import manifest_commands -from schematic.schemas.generator import SchemaGenerator from schematic.utils.google_api_utils import export_manifest_csv +from schematic.help import manifest_commands + from schematic.store.synapse import SynapseStorage from schematic.configuration.configuration import CONFIG @@ -59,7 +62,7 @@ def manifest(ctx, config): # use as `schematic manifest ...` help=query_dict(manifest_commands, ("manifest", "get", "data_type")), ) @click.option( - "-p", "--jsonld", help=query_dict(manifest_commands, ("manifest", "get", "jsonld")) + "-p", "--path_to_data_model", help=query_dict(manifest_commands, ("manifest", "get", "path_to_data_model")) ) @click.option( "-d", @@ -104,7 +107,7 @@ def get_manifest( ctx, title, data_type, - jsonld, + path_to_data_model, dataset_id, sheet_url, output_csv, @@ -121,17 +124,31 @@ def get_manifest( if data_type is None: data_type = CONFIG.manifest_data_type log_value_from_config("data_type", data_type) - if jsonld is None: - jsonld = CONFIG.model_location - log_value_from_config("jsonld", jsonld) + if path_to_data_model is None: + path_to_data_model = CONFIG.model_location + log_value_from_config("path_to_data_model", path_to_data_model) if title is None: title = CONFIG.manifest_title log_value_from_config("title", title) + data_model_parser = DataModelParser(path_to_data_model = path_to_data_model) + + #Parse Model + logger.info("Parsing data model.") + parsed_data_model = data_model_parser.parse_model() + + # Instantiate DataModelGraph + data_model_grapher = DataModelGraph(parsed_data_model) + + # Generate graph + logger.info("Generating data model graph.") + graph_data_model = data_model_grapher.generate_data_model_graph() + def create_single_manifest(data_type, output_csv=None, output_xlsx=None): # create object of type ManifestGenerator manifest_generator = ManifestGenerator( - path_to_json_ld=jsonld, + path_to_data_model=path_to_data_model, + graph = 
graph_data_model, title=t, root=data_type, use_annotations=use_annotations, @@ -174,7 +191,7 @@ def create_single_manifest(data_type, output_csv=None, output_xlsx=None): logger.info("Find the manifest template using this Google Sheet URL:") click.echo(result) if output_csv is None and output_xlsx is None: - prefix, _ = os.path.splitext(jsonld) + prefix, _ = os.path.splitext(path_to_data_model) prefix_root, prefix_ext = os.path.splitext(prefix) if prefix_ext == ".model": prefix = prefix_root @@ -194,9 +211,10 @@ def create_single_manifest(data_type, output_csv=None, output_xlsx=None): if type(data_type) is str: data_type = [data_type] - if data_type[0] == 'all manifests': - sg = SchemaGenerator(path_to_json_ld=jsonld) - component_digraph = sg.se.get_digraph_by_edge_type('requiresComponent') + if data_type[0] == 'all manifests': + # Feed graph into the data model graph explorer + dmge = DataModelGraphExplorer(graph_data_model) + component_digraph = dmge.get_digraph_by_edge_type('requiresComponent') components = component_digraph.nodes() for component in components: t = f'{title}.{component}.manifest' diff --git a/schematic/manifest/generator.py b/schematic/manifest/generator.py index fa842eeb5..049941ff2 100644 --- a/schematic/manifest/generator.py +++ b/schematic/manifest/generator.py @@ -1,6 +1,7 @@ from collections import OrderedDict import json import logging +import networkx as nx from openpyxl.styles import Font, Alignment, PatternFill from openpyxl import load_workbook from openpyxl.utils.dataframe import dataframe_to_rows @@ -12,7 +13,10 @@ from typing import Dict, List, Optional, Tuple, Union, BinaryIO, Literal from flask import send_from_directory -from schematic.schemas.generator import SchemaGenerator +from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer +from schematic.schemas.data_model_parser import DataModelParser +from schematic.schemas.data_model_json_schema import DataModelJSONSchema + from schematic.utils.google_api_utils import ( execute_google_api_requests, build_service_account_creds, @@ -35,7 +39,8 @@ class ManifestGenerator(object): def __init__( self, - path_to_json_ld: str, # JSON-LD file to be used for generating the manifest + path_to_data_model: str, # JSON-LD file to be used for generating the manifest + graph: nx.MultiDiGraph, # At this point, the graph is fully formed. alphabetize_valid_values: str = 'ascending', title: str = None, # manifest sheet title root: str = None, @@ -54,6 +59,12 @@ def __init__( # google service credentials object self.creds = services_creds["creds"] + # Path to jsonld + self.model_path = path_to_data_model + + # Graph + self.graph = graph + # schema root if root: self.root = root @@ -79,14 +90,14 @@ def __init__( "when there is no manifest file for the dataset in question." 
) - # SchemaGenerator() object - self.sg = SchemaGenerator(path_to_json_ld) + # Instantiate Data Model Explorer object + self.dmge = DataModelGraphExplorer(self.graph) # additional metadata to add to manifest self.additional_metadata = additional_metadata # Check if the class is in the schema - root_in_schema = self.sg.se.is_class_in_schema(self.root) + root_in_schema = self.dmge.is_class_in_schema(self.root) # If the class could not be found, give a notification if not root_in_schema: @@ -95,8 +106,7 @@ def __init__( raise LookupError(exception_message) # Determine whether current data type is file-based - self.is_file_based = "Filename" in self.sg.get_node_dependencies(self.root) - + self.is_file_based = "Filename" in self.dmge.get_node_dependencies(self.root) def _attribute_to_letter(self, attribute, manifest_fields): """Map attribute to column letter in a google sheet""" @@ -364,13 +374,12 @@ def _get_json_schema(self, json_schema_filepath: str) -> Dict: json_schema_filepath(str): path to json schema file Returns: Dictionary, containing portions of the json schema + TODO: Do we even allow people to provide a json_schema_filepath anyore? """ if not json_schema_filepath: - # if no json schema is provided; there must be - # schema explorer defined for schema.org schema - # o.w. this will throw an error - # TODO: catch error - json_schema = self.sg.get_json_schema_requirements(self.root, self.title) + # TODO Catch error if no JSONLD or JSON path provided. + data_model_js = DataModelJSONSchema(jsonld_path=self.model_path, graph=self.graph) + json_schema = data_model_js.get_json_validation_schema(source_node=self.root, schema_name=self.title) else: with open(json_schema_filepath) as jsonfile: json_schema = json.load(jsonfile) @@ -813,9 +822,9 @@ def _request_row_format(self, i, req): notes_body["requests"] (dict): with information on note to add to the column header. This notes body will be added to a request. """ - if self.sg.se: + if self.dmge: # get node definition - note = self.sg.get_node_definition(req) + note = self.dmge.get_node_comment(node_display_name = req) notes_body = { "requests": [ @@ -1014,8 +1023,7 @@ def _dependency_formatting( dependency_formatting_body = {"requests": []} for j, val_dep in enumerate(val_dependencies): is_required = False - - if self.sg.is_node_required(val_dep): + if self.dmge.get_node_required(node_display_name=val_dep): is_required = True else: is_required = False @@ -1058,13 +1066,13 @@ def _request_dependency_formatting( for req_val in req_vals: # get this required/valid value's node label in schema, based on display name (i.e. shown to the user in a dropdown to fill in) req_val = req_val["userEnteredValue"] - req_val_node_label = self.sg.get_node_label(req_val) + req_val_node_label = self.dmge.get_node_label(req_val) if not req_val_node_label: # if this node is not in the graph # continue - there are no dependencies for it continue # check if this required/valid value has additional dependency attributes - val_dependencies = self.sg.get_node_dependencies( + val_dependencies = self.dmge.get_node_dependencies( req_val_node_label, schema_ordered=False ) @@ -1117,7 +1125,7 @@ def _create_requests_body( requests_body["requests"] = [] for i, req in enumerate(ordered_metadata_fields[0]): # Gather validation rules and valid values for attribute. - validation_rules = self.sg.get_node_validation_rules(req) + validation_rules = self.dmge.get_node_validation_rules(node_display_name=req) # Add regex match validaiton rule to Google Sheets. 
if validation_rules and sheet_url: @@ -1364,7 +1372,7 @@ def map_annotation_names_to_display_names( pd.DataFrame: Annotations table with updated column headers. """ # Get list of attribute nodes from data model - model_nodes = self.sg.se.get_nx_schema().nodes + model_nodes = self.graph.nodes # Subset annotations to those appearing as a label in the model labels = filter(lambda x: x in model_nodes, annotations.columns) @@ -1492,7 +1500,7 @@ def _handle_output_format_logic(self, output_format: str = None, output_path: st return dataframe @staticmethod - def create_single_manifest(jsonld: str, data_type: str, access_token:Optional[str]=None, dataset_id:Optional[str]=None, strict:Optional[bool]=True, title:Optional[str]=None, output_format:Literal["google_sheet", "excel", "dataframe"]="google_sheet", use_annotations:Optional[bool]=False) -> Union[str, pd.DataFrame, BinaryIO]: + def create_single_manifest(path_to_data_model: str, graph_data_model: nx.MultiDiGraph, data_type: str, access_token:Optional[str]=None, dataset_id:Optional[str]=None, strict:Optional[bool]=True, title:Optional[str]=None, output_format:Literal["google_sheet", "excel", "dataframe"]="google_sheet", use_annotations:Optional[bool]=False) -> Union[str, pd.DataFrame, BinaryIO]: """Create a single manifest Args: @@ -1510,7 +1518,8 @@ def create_single_manifest(jsonld: str, data_type: str, access_token:Optional[st """ # create object of type ManifestGenerator manifest_generator = ManifestGenerator( - path_to_json_ld=jsonld, + path_to_data_model=path_to_data_model, + graph=graph_data_model, title=title, root=data_type, use_annotations=use_annotations, @@ -1536,11 +1545,11 @@ def create_single_manifest(jsonld: str, data_type: str, access_token:Optional[st return result @staticmethod - def create_manifests(jsonld:str, data_types:list, access_token:Optional[str]=None, dataset_ids:Optional[list]=None, output_format:Literal["google_sheet", "excel", "dataframe"]="google_sheet", title:Optional[str]=None, strict:Optional[bool]=True, use_annotations:Optional[bool]=False) -> Union[List[str], List[pd.DataFrame], BinaryIO]: + def create_manifests(path_to_data_model:str, data_types:list, access_token:Optional[str]=None, dataset_ids:Optional[list]=None, output_format:Literal["google_sheet", "excel", "dataframe"]="google_sheet", title:Optional[str]=None, strict:Optional[bool]=True, use_annotations:Optional[bool]=False) -> Union[List[str], List[pd.DataFrame], BinaryIO]: """Create multiple manifests Args: - jsonld (str): jsonld schema + path_to_data_model (str): str path to data model data_type (list): a list of data types access_token (str, optional): synapse access token. Required when getting an existing manifest. Defaults to None. dataset_id (list, optional): a list of dataset ids when generating an existing manifest. Defaults to None. @@ -1552,10 +1561,22 @@ def create_manifests(jsonld:str, data_types:list, access_token:Optional[str]=Non Returns: Union[List[str], List[pd.DataFrame], BinaryIO]: a list of Googlesheet URLs, a list of pandas dataframes or an Excel file. 
""" + data_model_parser = DataModelParser(path_to_data_model = path_to_data_model) + + #Parse Model + parsed_data_model = data_model_parser.parse_model() + + # Instantiate DataModelGraph + data_model_grapher = DataModelGraph(parsed_data_model) + + # Generate graph + graph_data_model = data_model_grapher.generate_data_model_graph() + + # Gather all returned result urls all_results = [] if data_types[0] == 'all manifests': - sg = SchemaGenerator(path_to_json_ld=jsonld) - component_digraph = sg.se.get_digraph_by_edge_type('requiresComponent') + dmge = DataModelGraphExplorer(graph_data_model) + component_digraph = dmge.get_digraph_by_edge_type('requiresComponent') components = component_digraph.nodes() for component in components: if title: @@ -1563,7 +1584,7 @@ def create_manifests(jsonld:str, data_types:list, access_token:Optional[str]=Non else: t = f'Example.{component}.manifest' if output_format != "excel": - result = ManifestGenerator.create_single_manifest(jsonld=jsonld, data_type=component, output_format=output_format, title=t, access_token=access_token, strict=strict, use_annotations=use_annotations) + result = ManifestGenerator.create_single_manifest(path_to_data_model=path_to_data_model, data_type=component, graph_data_model=graph_data_model, output_format=output_format, title=t, access_token=access_token) all_results.append(result) else: logger.error('Currently we do not support returning multiple files as Excel format at once. Please choose a different output format. ') @@ -1578,9 +1599,9 @@ def create_manifests(jsonld:str, data_types:list, access_token:Optional[str]=Non t = title if dataset_ids: # if a dataset_id is provided add this to the function call. - result = ManifestGenerator.create_single_manifest(jsonld=jsonld, data_type=dt, dataset_id=dataset_ids[i], output_format=output_format, title=t, access_token=access_token, strict=strict, use_annotations=use_annotations) + result = ManifestGenerator.create_single_manifest(path_to_data_model=path_to_data_model, data_type=dt, graph_data_model=graph_data_model, dataset_id=dataset_ids[i], output_format=output_format, title=t, access_token=access_token, use_annotations=use_annotations) else: - result = ManifestGenerator.create_single_manifest(jsonld=jsonld, data_type=dt, output_format=output_format, title=t, access_token=access_token, strict=strict, use_annotations=use_annotations) + result = ManifestGenerator.create_single_manifest(path_to_data_model=path_to_data_model, data_type=dt, graph_data_model=graph_data_model, output_format=output_format, title=t, access_token=access_token, use_annotations=use_annotations) # if output is pandas dataframe or google sheet url if isinstance(result, str) or isinstance(result, pd.DataFrame): @@ -1589,6 +1610,7 @@ def create_manifests(jsonld:str, data_types:list, access_token:Optional[str]=Non if len(data_types) > 1: logger.warning(f'Currently we do not support returning multiple files as Excel format at once. Only {t} would get returned. 
') return result + return all_results @@ -1632,7 +1654,7 @@ def get_manifest( # Get manifest file associated with given dataset (if applicable) # populate manifest with set of new files (if applicable) - manifest_record = store.updateDatasetManifestFiles(self.sg, datasetId = dataset_id, store = False) + manifest_record = store.updateDatasetManifestFiles(self.dmge, datasetId = dataset_id, store = False) # get URL of an empty manifest file created based on schema component empty_manifest_url = self.get_empty_manifest(strict=strict, sheet_url=True) @@ -1869,9 +1891,9 @@ def sort_manifest_fields(self, manifest_fields, order="schema"): # order manifest fields based on data-model schema if order == "schema": - if self.sg and self.root: + if self.dmge and self.root: # get display names of dependencies - dependencies_display_names = self.sg.get_node_dependencies(self.root) + dependencies_display_names = self.dmge.get_node_dependencies(self.root) # reorder manifest fields so that root dependencies are first and follow schema order manifest_fields = sorted( diff --git a/schematic/models/GE_Helpers.py b/schematic/models/GE_Helpers.py index e4de3310e..521d75157 100644 --- a/schematic/models/GE_Helpers.py +++ b/schematic/models/GE_Helpers.py @@ -25,7 +25,8 @@ from schematic.models.validate_attribute import GenerateError -from schematic.schemas.generator import SchemaGenerator +from schematic.schemas.data_model_graph import DataModelGraphExplorer + from schematic.utils.validate_utils import rule_in_rule_list, np_array_to_str_list, iterable_to_str_list logger = logging.getLogger(__name__) @@ -39,7 +40,7 @@ class GreatExpectationsHelpers(object): 2) Parse results dict to generate appropriate errors """ def __init__(self, - sg, + dmge, unimplemented_expectations, manifest, manifestPath @@ -48,8 +49,8 @@ def __init__(self, Purpose: Instantiate a great expectations helpers object Args: - sg: - schemaGenerator object + dmge: + DataModelGraphExplorer Object unimplemented_expectations: dictionary of validation rules that currently do not have expectations developed manifest: @@ -58,7 +59,7 @@ def __init__(self, path to manifest being validated """ self.unimplemented_expectations = unimplemented_expectations - self.sg = sg + self.dmge = dmge self.manifest = manifest self.manifestPath = manifestPath @@ -150,14 +151,14 @@ def build_expectation_suite(self,): expectation_suite_name=self.expectation_suite_name, ) - #build expectation configurations for each expecation + #build expectation configurations for each expectation for col in self.manifest.columns: args={} meta={} # remove trailing/leading whitespaces from manifest self.manifest.applymap(lambda x: x.strip() if isinstance(x, str) else x) - validation_rules = self.sg.get_node_validation_rules(col) + validation_rules = self.dmge.get_node_validation_rules(node_display_name=col) #check if attribute has any rules associated with it if validation_rules: @@ -383,7 +384,7 @@ def generate_errors( validation_types: Dict, errors: List, warnings: List, - sg: SchemaGenerator, + dmge: DataModelGraphExplorer, ): """ Purpose: @@ -448,7 +449,7 @@ def generate_errors( row_num = str(row+2), attribute_name = errColumn, invalid_entry = str(value), - sg = sg, + dmge = dmge, ) if vr_errors: errors.append(vr_errors) @@ -464,7 +465,7 @@ def generate_errors( module_to_call = 'match', attribute_name = errColumn, invalid_entry = value, - sg = sg, + dmge = dmge, ) if vr_errors: errors.append(vr_errors) @@ -476,7 +477,7 @@ def generate_errors( attribute_name = errColumn, row_num = 
np_array_to_str_list(np.array(indices)+2), error_val = iterable_to_str_list(values), - sg = self.sg + dmge = self.dmge ) if vr_errors: errors.append(vr_errors) diff --git a/schematic/models/metadata.py b/schematic/models/metadata.py index 15a4507b5..50e718014 100644 --- a/schematic/models/metadata.py +++ b/schematic/models/metadata.py @@ -1,24 +1,16 @@ -import json import os import logging -import string - -import numpy as np -import pandas as pd -import re import networkx as nx -from jsonschema import Draft7Validator, exceptions, validate, ValidationError, FormatError -from os.path import exists +from jsonschema import ValidationError # allows specifying explicit variable types from typing import Any, Dict, Optional, Text, List -# handle schema logic; to be refactored as SchemaExplorer matures into a package -# as collaboration with Biothings progresses - -from schematic.schemas.explorer import SchemaExplorer from schematic.manifest.generator import ManifestGenerator -from schematic.schemas.generator import SchemaGenerator +from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer +from schematic.schemas.data_model_parser import DataModelParser +from schematic.schemas.data_model_json_schema import DataModelJSONSchema + #TODO: This module should only be aware of the store interface # we shouldn't need to expose Synapse functionality explicitly @@ -26,10 +18,8 @@ from schematic.utils.df_utils import load_df -from schematic.models.validate_attribute import ValidateAttribute from schematic.models.validate_manifest import validate_all - logger = logging.getLogger(__name__) @@ -54,17 +44,24 @@ def __init__(self, inputMModelLocation: str, inputMModelLocationType: str,) -> N """ # extract extension of 'inputMModelLocation' # ensure that it is necessarily pointing to a '.jsonld' file - if inputMModelLocation.rpartition(".")[-1] == "jsonld": - logger.debug( - f"Initializing SchemaGenerator object from {inputMModelLocation} schema." - ) - self.inputMModelLocation = inputMModelLocation - self.sg = SchemaGenerator(inputMModelLocation) - else: - raise TypeError( - f"Please make sure {inputMModelLocation} is a .jsonld file." - ) + logger.debug( + f"Initializing DataModelGraphExplorer object from {inputMModelLocation} schema." + ) + + self.inputMModelLocation = inputMModelLocation + + data_model_parser = DataModelParser(path_to_data_model = self.inputMModelLocation) + #Parse Model + parsed_data_model = data_model_parser.parse_model() + + # Instantiate DataModelGraph + data_model_grapher = DataModelGraph(parsed_data_model) + + # Generate graph + self.graph_data_model = data_model_grapher.generate_data_model_graph() + + self.dmge = DataModelGraphExplorer(self.graph_data_model) # check if the type of MModel file is "local" # currently, the application only supports reading from local JSON-LD files @@ -103,7 +100,7 @@ def getOrderedModelNodes(self, rootNode: str, relationshipType: str) -> List[str Raises: ValueError: rootNode not found in metadata model. 
""" - ordered_nodes = self.sg.get_descendants_by_edge_type( + ordered_nodes = self.dmge.get_descendants_by_edge_type( rootNode, relationshipType, connected=True, ordered=True ) @@ -141,6 +138,7 @@ def getModelManifest( mg = ManifestGenerator( path_to_json_ld=self.inputMModelLocation, + graph = self.graph_data_model, title=title, root=rootNode, additional_metadata=additionalMetadata, @@ -170,11 +168,11 @@ def get_component_requirements( """ # get required components for the input/source component - req_components = self.sg.get_component_requirements(source_component) + req_components = self.dmge.get_component_requirements(source_component) # retreive components as graph if as_graph: - req_components_graph = self.sg.get_component_requirements_graph( + req_components_graph = self.dmge.get_component_requirements_graph( source_component ) @@ -212,7 +210,11 @@ def validateModelManifest( # get validation schema for a given node in the data model, if the user has not provided input validation schema if not jsonSchema: - jsonSchema = self.sg.get_json_schema_requirements( + + # Instantiate Data Model Json Schema + self.data_model_js = DataModelJSONSchema(jsonld_path=self.inputMModelLocation, graph=self.graph_data_model) + + jsonSchema = self.data_model_js.get_json_validation_schema( rootNode, rootNode + "_validation" ) @@ -263,14 +265,14 @@ def validateModelManifest( os.remove("great_expectations/expectations/Manifest_test_suite.json") errors, warnings, manifest = validate_all(self, - errors=errors, - warnings=warnings, - manifest=manifest, - manifestPath=manifestPath, - sg=self.sg, - jsonSchema=jsonSchema, - restrict_rules=restrict_rules, - project_scope=project_scope, + errors=errors, + warnings=warnings, + manifest=manifest, + manifestPath=manifestPath, + dmge=self.dmge, + jsonSchema=jsonSchema, + restrict_rules=restrict_rules, + project_scope=project_scope, access_token=access_token) return errors, warnings @@ -289,7 +291,7 @@ def populateModelManifest(self, title, manifestPath: str, rootNode: str, return_ ValueError: rootNode not found in metadata model. 
""" mg = ManifestGenerator( - path_to_json_ld=self.inputMModelLocation, title=title, root=rootNode + path_to_data_model=self.inputMModelLocation, graph = self.graph_data_model, title=title, root=rootNode ) emptyManifestURL = mg.get_manifest() @@ -336,7 +338,7 @@ def submit_metadata_manifest( try: # check if the component ("class" in schema) passed as argument is valid (present in schema) or not - self.sg.se.is_class_in_schema(validate_component) + self.dmge.is_class_in_schema(validate_component) except: # a KeyError exception is raised when validate_component fails in the try-block above # here, we are suppressing the KeyError exception and replacing it with a more @@ -354,9 +356,9 @@ def submit_metadata_manifest( # if there are no errors in validation process if val_errors == []: # upload manifest file from `manifest_path` path to entity with Syn ID `dataset_id` - if exists(censored_manifest_path): + if os.path.exists(censored_manifest_path): censored_manifest_id = syn_store.associateMetadataWithFiles( - schemaGenerator = self.sg, + dmge = self.dmge, metadataManifestPath = censored_manifest_path, datasetId = dataset_id, manifest_record_type = manifest_record_type, @@ -367,7 +369,7 @@ def submit_metadata_manifest( restrict_maniest = True manifest_id = syn_store.associateMetadataWithFiles( - schemaGenerator = self.sg, + dmge = self.dmge, metadataManifestPath = manifest_path, datasetId = dataset_id, manifest_record_type = manifest_record_type, @@ -387,9 +389,9 @@ def submit_metadata_manifest( ) # no need to perform validation, just submit/associate the metadata manifest file - if exists(censored_manifest_path): + if os.path.exists(censored_manifest_path): censored_manifest_id = syn_store.associateMetadataWithFiles( - schemaGenerator = self.sg, + dmge = self.dmge, metadataManifestPath=censored_manifest_path, datasetId=dataset_id, manifest_record_type=manifest_record_type, @@ -400,7 +402,7 @@ def submit_metadata_manifest( restrict_maniest = True manifest_id = syn_store.associateMetadataWithFiles( - schemaGenerator = self.sg, + dmge = self.dmge, metadataManifestPath=manifest_path, datasetId=dataset_id, manifest_record_type=manifest_record_type, diff --git a/schematic/models/validate_attribute.py b/schematic/models/validate_attribute.py index 362a86fcc..f923891ba 100644 --- a/schematic/models/validate_attribute.py +++ b/schematic/models/validate_attribute.py @@ -16,7 +16,8 @@ import pandas as pd from jsonschema import ValidationError -from schematic.schemas.generator import SchemaGenerator +from schematic.schemas.data_model_graph import DataModelGraphExplorer + from schematic.store.base import BaseStorage from schematic.store.synapse import SynapseStorage from schematic.utils.validate_rules_utils import validation_rule_info @@ -32,7 +33,7 @@ logger = logging.getLogger(__name__) class GenerateError: - def generate_schema_error(row_num: str, attribute_name: str, error_msg: str, invalid_entry: str, sg: SchemaGenerator,)-> List[str]: + def generate_schema_error(row_num: str, attribute_name: str, error_msg: str, invalid_entry: str, dmge: DataModelGraphExplorer,)-> List[str]: ''' Purpose: Process error messages generated from schema Input: @@ -50,7 +51,7 @@ def generate_schema_error(row_num: str, attribute_name: str, error_msg: str, inv raises = GenerateError.get_message_level( val_rule = 'schema', attribute_name = attribute_name, - sg = sg, + dmge = dmge, ) #if a message needs to be raised, get the approrpiate function to do so @@ -78,7 +79,7 @@ def generate_schema_error(row_num: str, attribute_name: 
str, error_msg: str, inv def generate_list_error( list_string: str, row_num: str, attribute_name: str, list_error: str, - invalid_entry:str, sg: SchemaGenerator, val_rule: str, + invalid_entry:str, dmge: DataModelGraphExplorer, val_rule: str, ) -> List[str]: """ Purpose: @@ -101,7 +102,7 @@ def generate_list_error( raises = GenerateError.get_message_level( val_rule = val_rule, attribute_name = attribute_name, - sg = sg, + dmge = dmge, ) #if a message needs to be raised, get the approrpiate function to do so @@ -138,7 +139,7 @@ def generate_regex_error( module_to_call: str, attribute_name: str, invalid_entry: str, - sg: SchemaGenerator, + dmge: DataModelGraphExplorer, ) -> List[str]: """ Purpose: @@ -162,7 +163,7 @@ def generate_regex_error( raises = GenerateError.get_message_level( val_rule = val_rule, attribute_name = attribute_name, - sg = sg, + dmge = dmge, ) #if a message needs to be raised, get the approrpiate function to do so @@ -191,7 +192,7 @@ def generate_regex_error( return error_list, warning_list def generate_type_error( - val_rule: str, row_num: str, attribute_name: str, invalid_entry:str, sg: SchemaGenerator, + val_rule: str, row_num: str, attribute_name: str, invalid_entry:str, dmge: DataModelGraphExplorer, ) -> List[str]: """ Purpose: @@ -209,12 +210,12 @@ def generate_type_error( error_list = [] warning_list = [] - + #Determine which, if any, message to raise raises = GenerateError.get_message_level( - val_rule = val_rule, + dmge = dmge, attribute_name = attribute_name, - sg = sg, + val_rule = val_rule, ) #if a message needs to be raised, get the approrpiate function to do so @@ -232,8 +233,15 @@ def generate_type_error( error_message = type_error_str error_val = invalid_entry + #TODO: not sure if this i needed (to split) + validation_rules=dmge.get_node_validation_rules(node_display_name=attribute_name) + + #TODO: Can remove when handling updated so split within graph + if validation_rules and '::' in validation_rules[0]: + validation_rules = validation_rules[0].split("::") + # If IsNA rule is being used to allow `Not Applicable` entries, do not log a message - if error_val.lower() == 'not applicable' and rule_in_rule_list('IsNA', sg.get_node_validation_rules(sg.get_node_label(attribute_name))): + if error_val.lower() == 'not applicable' and rule_in_rule_list('IsNA', validation_rules): pass else: logLevel(type_error_str) @@ -248,7 +256,7 @@ def generate_type_error( def generate_url_error( url: str, url_error: str, row_num: str, attribute_name: str, argument: str, - invalid_entry:str, sg: SchemaGenerator, val_rule: str, + invalid_entry:str, dmge: DataModelGraphExplorer, val_rule: str, ) -> List[str]: """ Purpose: @@ -282,7 +290,7 @@ def generate_url_error( raises = GenerateError.get_message_level( val_rule = val_rule, attribute_name = attribute_name, - sg = sg, + dmge = dmge, ) #if a message needs to be raised, get the approrpiate function to do so @@ -332,7 +340,7 @@ def generate_url_error( def generate_cross_warning( val_rule: str, attribute_name: str, - sg: SchemaGenerator, + dmge: DataModelGraphExplorer, matching_manifests = [], missing_manifest_ID = None, invalid_entry = None, @@ -362,7 +370,7 @@ def generate_cross_warning( raises = GenerateError.get_message_level( val_rule = val_rule, attribute_name = attribute_name, - sg = sg, + dmge = dmge, ) #if a message needs to be raised, get the approrpiate function to do so @@ -410,7 +418,7 @@ def generate_cross_warning( def generate_content_error( val_rule: str, attribute_name: str, - sg: SchemaGenerator, + dmge: 
DataModelGraphExplorer, row_num = None, error_val = None, ) -> (List[str], List[str]): @@ -425,7 +433,7 @@ def generate_content_error( Input: val_rule: str, defined in the schema. attribute_name: str, attribute being validated - sg: schemaGenerator object + dmge: DataModelGraphExplorer object row_num: str, row where the error was detected error_val: value duplicated @@ -444,7 +452,7 @@ def generate_content_error( raises = GenerateError.get_message_level( val_rule=val_rule, attribute_name = attribute_name, - sg = sg, + dmge = dmge, ) #if a message needs to be raised, get the approrpiate function to do so @@ -506,7 +514,7 @@ def generate_content_error( return error_list, warning_list def get_message_level( - sg: SchemaGenerator, + dmge: DataModelGraphExplorer, attribute_name: str, val_rule: str, ) -> str: @@ -522,7 +530,7 @@ def get_message_level( Input: val_rule: str, defined in the schema. - sg: schemaGenerator object + dmge: DataModelGraphExplorer object attribute_name: str, attribute being validated Returns: 'error', 'warning' or None @@ -536,16 +544,15 @@ def get_message_level( #set message level to default and change after if rule_parts[0] != 'schema': level = rule_info[rule_parts[0]]['default_message_level'] - # Parse rule for level, set to default if not specified if rule_parts[-1].lower() == 'error' or rule_parts[0] == 'schema': level = 'error' elif rule_parts[-1].lower() == 'warning': level = 'warning' - elif not sg.is_node_required(node_display_name=attribute_name): + elif not dmge.get_node_required(node_display_name=attribute_name): # If not required raise warnings to notify level = 'warning' - elif sg.is_node_required(node_display_name=attribute_name) and 'recommended' in val_rule: + elif dmge.get_node_required(node_display_name=attribute_name) and 'recommended' in val_rule: level = None return level @@ -595,7 +602,7 @@ def get_target_manifests(target_component, project_scope: List, access_token: st return synStore, target_manifest_IDs, target_dataset_IDs def list_validation( - self, val_rule: str, manifest_col: pd.core.series.Series, sg: SchemaGenerator, + self, val_rule: str, manifest_col: pd.core.series.Series, dmge: DataModelGraphExplorer, ) -> (List[List[str]], List[List[str]], pd.core.series.Series): """ Purpose: @@ -636,7 +643,7 @@ def list_validation( attribute_name=manifest_col.name, list_error=list_error, invalid_entry=manifest_col[i], - sg = sg, + dmge = dmge, val_rule = val_rule, ) if vr_errors: @@ -651,7 +658,7 @@ def list_validation( return errors, warnings, manifest_col def regex_validation( - self, val_rule: str, manifest_col: pd.core.series.Series, sg: SchemaGenerator, + self, val_rule: str, manifest_col: pd.core.series.Series, dmge: DataModelGraphExplorer, ) -> (List[List[str]], List[List[str]]): """ Purpose: @@ -661,6 +668,7 @@ def regex_validation( - val_rule: str, Validation rule - manifest_col: pd.core.series.Series, column for a given attribute in the manifest + - dmge: DataModelGraphExplorer Object Using this module requres validation rules written in the following manner: 'regex module regular expression' - regex: is an exact string specifying that the input is to be validated as a @@ -691,7 +699,10 @@ def regex_validation( errors = [] warnings = [] - validation_rules=self.sg.se.get_class_validation_rules(self.sg.se.get_class_label_from_display_name(manifest_col.name)) + + validation_rules = dmge.get_node_validation_rules(node_display_name=manifest_col.name) + if validation_rules and '::' in validation_rules[0]: + validation_rules = 
validation_rules[0].split("::") # Handle case where validating re's within a list. if re.search('list',"|".join(validation_rules)): if type(manifest_col[0]) == str: @@ -711,7 +722,7 @@ def regex_validation( module_to_call=reg_exp_rules[1], attribute_name=manifest_col.name, invalid_entry=manifest_col[i], - sg = sg, + dmge = dmge, ) if vr_errors: errors.append(vr_errors) @@ -732,7 +743,7 @@ def regex_validation( module_to_call=reg_exp_rules[1], attribute_name=manifest_col.name, invalid_entry=manifest_col[i], - sg = sg, + dmge = dmge, ) if vr_errors: errors.append(vr_errors) @@ -742,7 +753,7 @@ def regex_validation( return errors, warnings def type_validation( - self, val_rule: str, manifest_col: pd.core.series.Series, sg: SchemaGenerator, + self, val_rule: str, manifest_col: pd.core.series.Series, dmge: DataModelGraphExplorer, ) -> (List[List[str]], List[List[str]]): """ Purpose: @@ -753,6 +764,7 @@ def type_validation( 'float', 'int', 'num', 'str' - manifest_col: pd.core.series.Series, column for a given attribute in the manifest + - dmge: DataModelGraphExplorer Object Returns: -This function will return errors when the user input value does not match schema specifications. @@ -780,7 +792,7 @@ def type_validation( row_num=str(i + 2), attribute_name=manifest_col.name, invalid_entry=str(manifest_col[i]), - sg = sg, + dmge = dmge, ) if vr_errors: errors.append(vr_errors) @@ -794,7 +806,7 @@ def type_validation( row_num=str(i + 2), attribute_name=manifest_col.name, invalid_entry=str(manifest_col[i]), - sg = sg, + dmge = dmge, ) if vr_errors: errors.append(vr_errors) @@ -802,7 +814,7 @@ def type_validation( warnings.append(vr_warnings) return errors, warnings - def url_validation(self, val_rule: str, manifest_col: str, sg: SchemaGenerator,) -> (List[List[str]], List[List[str]]): + def url_validation(self, val_rule: str, manifest_col: str, dmge: DataModelGraphExplorer) -> (List[List[str]], List[List[str]]): """ Purpose: Validate URL's submitted for a particular attribute in a manifest. @@ -812,6 +824,7 @@ def url_validation(self, val_rule: str, manifest_col: str, sg: SchemaGenerator,) - val_rule: str, Validation rule - manifest_col: pd.core.series.Series, column for a given attribute in the manifest + - dmge: DataModelGraphExplorer Object Output: This function will return errors when the user input value does not match schema specifications. 
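The hunks above follow a single recurring pattern: every validator that previously took a SchemaGenerator (sg) now takes a DataModelGraphExplorer (dmge), and validation rules are looked up by display name. A minimal sketch of that pattern, assuming a fully built graph_data_model (nx.MultiDiGraph); the attribute name "Patient ID" is a hypothetical example, not a value from this changeset:

    from schematic.schemas.data_model_graph import DataModelGraphExplorer

    # Query the fully formed data model graph rather than a SchemaGenerator.
    dmge = DataModelGraphExplorer(graph_data_model)

    # Rules are now fetched by display name.
    validation_rules = dmge.get_node_validation_rules(node_display_name="Patient ID")  # hypothetical attribute

    # Composite rules may still arrive as a single '::'-delimited string, so callers
    # split them until that handling moves into the graph itself (see the TODOs above).
    if validation_rules and "::" in validation_rules[0]:
        validation_rules = validation_rules[0].split("::")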
@@ -841,7 +854,7 @@ def url_validation(self, val_rule: str, manifest_col: str, sg: SchemaGenerator,) attribute_name=manifest_col.name, argument=url_args, invalid_entry=manifest_col[i], - sg = sg, + dmge = dmge, val_rule = val_rule, ) if vr_errors: @@ -869,7 +882,7 @@ def url_validation(self, val_rule: str, manifest_col: str, sg: SchemaGenerator,) attribute_name=manifest_col.name, argument=url_args, invalid_entry=manifest_col[i], - sg = sg, + dmge = dmge, val_rule = val_rule, ) if vr_errors: @@ -889,7 +902,7 @@ def url_validation(self, val_rule: str, manifest_col: str, sg: SchemaGenerator,) attribute_name=manifest_col.name, argument=arg, invalid_entry=manifest_col[i], - sg = sg, + dmge = dmge, val_rule = val_rule, ) if vr_errors: @@ -899,7 +912,7 @@ def url_validation(self, val_rule: str, manifest_col: str, sg: SchemaGenerator,) return errors, warnings def cross_validation( - self, val_rule: str, manifest_col: pd.core.series.Series, project_scope: List, sg: SchemaGenerator, access_token: str, + self, val_rule: str, manifest_col: pd.core.series.Series, project_scope: List, dmge: DataModelGraphExplorer, access_token: str, ) -> List[List[str]]: """ Purpose: @@ -909,6 +922,7 @@ def cross_validation( - val_rule: str, Validation rule - manifest_col: pd.core.series.Series, column for a given attribute in the manifest + - dmge: DataModelGraphExplorer Object Output: This function will return errors when values in the current manifest's attribute are not fully present in the correct amount of other manifests. @@ -986,7 +1000,7 @@ def cross_validation( row_num = missing_rows, attribute_name = source_attribute, invalid_entry = iterable_to_str_list(missing_values), - sg = sg, + dmge = dmge, ) if vr_errors: errors.append(vr_errors) @@ -1001,7 +1015,7 @@ def cross_validation( row_num = invalid_rows, attribute_name = source_attribute, invalid_entry = iterable_to_str_list(invalid_values.squeeze()), - sg = sg, + dmge = dmge, ) if vr_errors: errors.append(vr_errors) @@ -1028,7 +1042,7 @@ def cross_validation( attribute_name = source_attribute, invalid_entry = missing_values, missing_manifest_ID = missing_manifest_IDs, - sg = sg, + dmge = dmge, ) if vr_errors: errors.append(vr_errors) @@ -1039,7 +1053,7 @@ def cross_validation( val_rule = val_rule, attribute_name = source_attribute, matching_manifests = present_manifest_log, - sg = sg, + dmge = dmge, ) if vr_errors: errors.append(vr_errors) diff --git a/schematic/models/validate_manifest.py b/schematic/models/validate_manifest.py index a6e365b6e..ff180998a 100644 --- a/schematic/models/validate_manifest.py +++ b/schematic/models/validate_manifest.py @@ -19,7 +19,7 @@ from urllib import error from schematic.models.validate_attribute import ValidateAttribute, GenerateError -from schematic.schemas.generator import SchemaGenerator +from schematic.schemas.data_model_graph import DataModelGraphExplorer from schematic.store.synapse import SynapseStorage from schematic.models.GE_Helpers import GreatExpectationsHelpers from schematic.utils.validate_rules_utils import validation_rule_info @@ -28,11 +28,11 @@ logger = logging.getLogger(__name__) class ValidateManifest(object): - def __init__(self, errors, manifest, manifestPath, sg, jsonSchema): + def __init__(self, errors, manifest, manifestPath, dmge, jsonSchema): self.errors = errors self.manifest = manifest self.manifestPath = manifestPath - self.sg = sg + self.dmge = dmge self.jsonSchema = jsonSchema def get_multiple_types_error( @@ -62,7 +62,7 @@ def get_multiple_types_error( return ["NA", error_col, error_message, 
error_val] def validate_manifest_rules( - self, manifest: pd.core.frame.DataFrame, sg: SchemaGenerator, restrict_rules: bool, project_scope: List, access_token: Optional[str] = None, + self, manifest: pd.core.frame.DataFrame, dmge: DataModelGraphExplorer, restrict_rules: bool, project_scope: List, access_token: Optional[str] = None, ) -> (pd.core.frame.DataFrame, List[List[str]]): """ Purpose: @@ -72,7 +72,7 @@ def validate_manifest_rules( manifest: pd.core.frame.DataFrame imported from models/metadata.py contains metadata input from user for each attribute. - sg: SchemaGenerator + dmge: DataModelGraphExplorer initialized within models/metadata.py Returns: manifest: pd.core.frame.DataFrame @@ -129,7 +129,7 @@ def validate_manifest_rules( t_GE = perf_counter() #operations necessary to set up and run ge suite validation ge_helpers=GreatExpectationsHelpers( - sg=sg, + dmge=dmge, unimplemented_expectations=unimplemented_expectations, manifest = manifest, manifestPath = self.manifestPath, @@ -155,8 +155,7 @@ def validate_manifest_rules( ge_helpers.context.delete_checkpoint(ge_helpers.checkpoint_name) ge_helpers.context.delete_expectation_suite(ge_helpers.expectation_suite_name) - validation_results = results.list_validation_results() - + validation_results = results.list_validation_results() #parse validation results dict and generate errors errors, warnings = ge_helpers.generate_errors( @@ -164,7 +163,7 @@ def validate_manifest_rules( warnings = warnings, validation_results = validation_results, validation_types = validation_types, - sg = sg, + dmge = dmge, ) logger.debug(f"GE elapsed time {perf_counter()-t_GE}") else: @@ -176,7 +175,11 @@ def validate_manifest_rules( # remove trailing/leading whitespaces from manifest manifest.applymap(lambda x: x.strip() if isinstance(x, str) else x) - validation_rules = sg.get_node_validation_rules(col) + validation_rules = dmge.get_node_validation_rules(node_display_name=col) + + #TODO: Can remove when handling updated so split within graph + if validation_rules and '::' in validation_rules[0]: + validation_rules = validation_rules[0].split("::") # Check that attribute rules conform to limits: # no more than two rules for an attribute. @@ -204,16 +207,16 @@ def validate_manifest_rules( if validation_type == "list": vr_errors, vr_warnings, manifest_col = validation_method( - self, rule, manifest[col], sg, + self, rule, manifest[col], dmge, ) manifest[col] = manifest_col elif validation_type.lower().startswith("match"): vr_errors, vr_warnings = validation_method( - self, rule, manifest[col], project_scope, sg, access_token + self, rule, manifest[col], project_scope, dmge, access_token ) else: vr_errors, vr_warnings = validation_method( - self, rule, manifest[col], sg, + self, rule, manifest[col], dmge, ) # Check for validation rule errors and add them to other errors. 
if vr_errors: @@ -224,7 +227,7 @@ def validate_manifest_rules( logger.debug(f"In House validation elapsed time {perf_counter()-t_err}") return manifest, errors, warnings - def validate_manifest_values(self, manifest, jsonSchema, sg + def validate_manifest_values(self, manifest, jsonSchema, dmge, ) -> (List[List[str]], List[List[str]]): t_json_schema = perf_counter() @@ -247,7 +250,7 @@ def validate_manifest_values(self, manifest, jsonSchema, sg errorMsg = error.message[0:500] errorVal = error.instance if len(error.path) > 0 else "Wrong schema" - val_errors, val_warnings = GenerateError.generate_schema_error(row_num = errorRow, attribute_name = errorColName, error_msg = errorMsg, invalid_entry = errorVal, sg = sg) + val_errors, val_warnings = GenerateError.generate_schema_error(row_num = errorRow, attribute_name = errorColName, error_msg = errorMsg, invalid_entry = errorVal, dmge = dmge) if val_errors: errors.append(val_errors) @@ -257,15 +260,15 @@ def validate_manifest_values(self, manifest, jsonSchema, sg return errors, warnings -def validate_all(self, errors, warnings, manifest, manifestPath, sg, jsonSchema, restrict_rules, project_scope: List, access_token: str): - vm = ValidateManifest(errors, manifest, manifestPath, sg, jsonSchema) - manifest, vmr_errors, vmr_warnings = vm.validate_manifest_rules(manifest, sg, restrict_rules, project_scope, access_token) +def validate_all(self, errors, warnings, manifest, manifestPath, dmge, jsonSchema, restrict_rules, project_scope: List, access_token: str): + vm = ValidateManifest(errors, manifest, manifestPath, dmge, jsonSchema) + manifest, vmr_errors, vmr_warnings = vm.validate_manifest_rules(manifest, dmge, restrict_rules, project_scope, access_token) if vmr_errors: errors.extend(vmr_errors) if vmr_warnings: warnings.extend(vmr_warnings) - vmv_errors, vmv_warnings = vm.validate_manifest_values(manifest, jsonSchema, sg) + vmv_errors, vmv_warnings = vm.validate_manifest_values(manifest, jsonSchema, dmge) if vmv_errors: errors.extend(vmv_errors) if vmv_warnings: diff --git a/schematic/schemas/__init__.py b/schematic/schemas/__init__.py index 93df34ead..7943ef50e 100644 --- a/schematic/schemas/__init__.py +++ b/schematic/schemas/__init__.py @@ -1,3 +1,7 @@ -from schematic.schemas.explorer import SchemaExplorer -from schematic.schemas.generator import SchemaGenerator -from schematic.schemas.validator import SchemaValidator +from schematic.schemas.data_model_edges import DataModelEdges +from schematic.schemas.data_model_nodes import DataModelNodes +from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer +from schematic.schemas.data_model_json_schema import DataModelJSONSchema +from schematic.schemas.data_model_jsonld import DataModelJsonLD +from schematic.schemas.data_model_relationships import DataModelRelationships +from schematic.schemas.data_model_validator import DataModelValidator diff --git a/schematic/schemas/commands.py b/schematic/schemas/commands.py index 13ff2e2c2..80700e2bf 100644 --- a/schematic/schemas/commands.py +++ b/schematic/schemas/commands.py @@ -4,17 +4,24 @@ import click_log import logging import sys +import time import re -from schematic.schemas.df_parser import _convert_csv_to_data_model +from schematic.schemas.data_model_parser import DataModelParser +from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer +from schematic.schemas.data_model_validator import DataModelValidator +from schematic.schemas.data_model_jsonld import DataModelJsonLD, convert_graph_to_jsonld + 
from schematic.utils.cli_utils import query_dict +from schematic.utils.schema_utils import export_schema from schematic.help import schema_commands -logger = logging.getLogger('schematic') +logger = logging.getLogger("schematic") click_log.basic_config(logger) CONTEXT_SETTINGS = dict(help_option_names=["--help", "-h"]) # help options + # invoke_without_command=True -> forces the application not to show aids before losing them with a --h @click.group(context_settings=CONTEXT_SETTINGS, invoke_without_command=True) def schema(): # use as `schematic model ...` @@ -32,14 +39,7 @@ def schema(): # use as `schematic model ...` ) @click_log.simple_verbosity_option(logger) @click.argument( - "schema_csv", type=click.Path(exists=True), metavar="", nargs=1 -) -@click.option( - "--base_schema", - "-b", - type=click.Path(exists=True), - metavar="", - help=query_dict(schema_commands, ("schema", "convert", "base_schema")), + "schema", type=click.Path(exists=True), metavar="", nargs=1 ) @click.option( "--output_jsonld", @@ -47,29 +47,86 @@ def schema(): # use as `schematic model ...` metavar="", help=query_dict(schema_commands, ("schema", "convert", "output_jsonld")), ) -def convert(schema_csv, base_schema, output_jsonld): +def convert(schema, output_jsonld): """ Running CLI to convert data model specification in CSV format to data model in JSON-LD format. + + Note: Currently, not configured to build off of base model, so removing --base_schema argument for now """ - # convert RFC to Data Model - base_se = _convert_csv_to_data_model(schema_csv, base_schema) - # output JSON-LD file alongside CSV file by default + # get the start time + st = time.time() + + # Instantiate Parser + data_model_parser = DataModelParser(schema) + + # Parse Model + logger.info("Parsing data model.") + parsed_data_model = data_model_parser.parse_model() + + # Convert parsed model to graph + # Instantiate DataModelGraph + data_model_grapher = DataModelGraph(parsed_data_model) + + # Generate graph + logger.info("Generating data model graph.") + graph_data_model = data_model_grapher.generate_data_model_graph() + + # Validate generated data model. + logger.info("Validating the data model internally.") + data_model_validator = DataModelValidator(graph=graph_data_model) + data_model_errors, data_model_warnings = data_model_validator.run_checks() + + # If there are errors log them. + if data_model_errors: + for err in data_model_errors: + if isinstance(err, str): + logger.error(err) + elif isinstance(err, list): + for e in err: + logger.error(e) + + # If there are warnings log them. + if data_model_warnings: + for war in data_model_warnings: + if isinstance(war, str): + logger.warning(war) + elif isinstance(war, list): + for w in war: + logger.warning(w) + + logger.info("Converting data model to JSON-LD") + jsonld_data_model = convert_graph_to_jsonld(Graph=graph_data_model) + + # output JSON-LD file alongside CSV file by default, get path. if output_jsonld is None: - csv_no_ext = re.sub("[.]csv$", "", schema_csv) - output_jsonld = csv_no_ext + ".jsonld" + if not '.jsonld' in schema: + csv_no_ext = re.sub("[.]csv$", "", schema) + output_jsonld = csv_no_ext + ".jsonld" + else: + output_jsonld = schema logger.info( "By default, the JSON-LD output will be stored alongside the first " - f"input CSV file. In this case, it will appear here: '{output_jsonld}'. " + f"input CSV or JSON-LD file. In this case, it will appear here: '{output_jsonld}'. " "You can use the `--output_jsonld` argument to specify another file path." 
) # saving updated schema.org schema try: - base_se.export_schema(output_jsonld) - click.echo(f"The Data Model was created and saved to '{output_jsonld}' location.") + export_schema(jsonld_data_model, output_jsonld) + click.echo( + f"The Data Model was created and saved to '{output_jsonld}' location." + ) except: - click.echo(f"The Data Model could not be created by using '{output_jsonld}' location. Please check your file path again") + click.echo( + f"The Data Model could not be created by using '{output_jsonld}' location. Please check your file path again" + ) + + # get the end time + et = time.time() + # get the execution time + elapsed_time = time.strftime("%M:%S", time.gmtime(et - st)) + click.echo(f"Execution time: {elapsed_time} (M:S)") diff --git a/schematic/schemas/data_model_edges.py b/schematic/schemas/data_model_edges.py new file mode 100644 index 000000000..7abbc26a8 --- /dev/null +++ b/schematic/schemas/data_model_edges.py @@ -0,0 +1,91 @@ +import networkx as nx + +from schematic.schemas.data_model_relationships import DataModelRelationships + + +class DataModelEdges: + def __init__(self): + self.dmr = DataModelRelationships() + self.data_model_relationships = self.dmr.relationships_dictionary + + def generate_edge( + self, + node: str, + all_node_dict: dict, + attr_rel_dict: dict, + edge_relationships: dict, + edge_list:list, + ) -> list[tuple[str, str, dict[str:str, str:int]]]: + """Generate an edge between a target node and relevant other nodes the data model. In short, does this current node belong to a recorded relationship in the attribute, relationshps dictionary. Go through each attribute and relationship to find where the node may be. + Args: + G, nx.MultiDiGraph: networkx graph representation of the data model, that is in the process of being fully built. At this point, all the nodes would have been added, and edges are being added per target node. + node, str: target node to look for connecting edges + all_node_dict, dict: a dictionary containing information about all nodes in the model + key: node display name + value: node attribute dict, containing attributes to attach to each node. + attr_rel_dict, dict: + {Attribute Display Name: { + Relationships: { + CSV Header: Value}}} + edge_relationships: dict, rel_key: csv_header if the key represents a value relationship. + edge_list: list(tuple), list of tuples describing the edges and the edge attributes, organized as (node_1, node_2, {key:edge_relationship_key, weight:int}) + At this point, the edge list will be in the process of being built. Adding edges from list so they will be added properly to the graph without being overwritten in the loop, and passing the Graph around more. + Returns: + edge_list: list(tuple), list of tuples describing the edges and the edge attributes, organized as (node_1, node_2, {key:edge_relationship_key, weight:int}) + At this point, the edge list will have additional edges added related to the current node. + """ + # For each attribute in the model. + for attribute_display_name, relationship in attr_rel_dict.items(): + # Get the relationships associated with the current attribute + relationships = relationship["Relationships"] + # Add edge relationships one at a time + for rel_key, csv_header in edge_relationships.items(): + # If the attribute has a relationship that matches the current edge being added + if csv_header in relationships.keys(): + # If the current node is part of that relationship and is not the current node + # Connect node to attribute as an edge. 
+ if ( + node in relationships[csv_header] + and node != attribute_display_name + ): + # Generate weights based on relationship type. + # Weights will allow us to preserve the order of entries in the data model in later steps. + if rel_key == "domainIncludes": + # For 'domainIncludes'/properties relationship, users do not explicitly provide a list order (like for valid values, or dependsOn) + # so we pull the order/weight from the order of the attributes. + weight = list(attr_rel_dict.keys()).index( + attribute_display_name + ) + elif type(relationships[csv_header]) == list: + # For other relationships that pull in lists of values, we can explicitly pull the weight by their order in the provided list + weight = relationships[csv_header].index(node) + else: + # For single (non list) entries, add weight of 0 + weight = 0 + # Get the edge_key for the edge relationship we are adding at this step + edge_key = self.data_model_relationships[rel_key]["edge_key"] + # Add edges, in a manner that preserves directionality + # TODO: rewrite to use edge_dir + if rel_key in ["subClassOf", "domainIncludes"]: + edge_list.append(( + all_node_dict[node]["label"], + all_node_dict[attribute_display_name]["label"], + {'key':edge_key, + 'weight':weight,}) + ) + else: + edge_list.append(( + all_node_dict[attribute_display_name]["label"], + all_node_dict[node]["label"], + {'key':edge_key, + 'weight':weight},) + ) + # Add rangeIncludes/valid value relationships in reverse as well, making the attribute the parent of the valid value. + if rel_key == "rangeIncludes": + edge_list.append(( + all_node_dict[attribute_display_name]["label"], + all_node_dict[node]["label"], + {'key':"parentOf", + 'weight':weight},) + ) + return edge_list diff --git a/schematic/schemas/data_model_graph.py b/schematic/schemas/data_model_graph.py new file mode 100644 index 000000000..917d1eb71 --- /dev/null +++ b/schematic/schemas/data_model_graph.py @@ -0,0 +1,756 @@ +from copy import deepcopy +import graphviz +import logging +from typing import Any, Dict, Optional, Text +import networkx as nx +from rdflib import Namespace + +from schematic.schemas.data_model_edges import DataModelEdges +from schematic.schemas.data_model_nodes import DataModelNodes +from schematic.schemas.data_model_relationships import DataModelRelationships + +from schematic.utils.schema_utils import ( + get_property_label_from_display_name, + get_class_label_from_display_name, +) +from schematic.utils.general import unlist +from schematic.utils.viz_utils import visualize + +logger = logging.getLogger(__name__) + + +class DataModelGraphMeta(object): + _instances = {} + + def __call__(cls, *args, **kwargs): + """ + Possible changes to the value of the `__init__` argument do not affect + the returned instance. + """ + if cls not in cls._instances: + instance = super().__call__(*args, **kwargs) + cls._instances[cls] = instance + return cls._instances[cls] + + +class DataModelGraph: + """ + Generate graph network (networkx) from the attributes and relationships returned + from the data model parser. + + Create a singleton. + """ + + __metaclass__ = DataModelGraphMeta + + def __init__(self, attribute_relationships_dict: dict) -> None: + """Load parsed data model. + Args: + attributes_relationship_dict, dict: generated in data_model_parser + {Attribute Display Name: { + Relationships: { + CSV Header: Value}}} + Raises: + ValueError, attribute_relationship_dict not loaded.
+ """ + self.attribute_relationships_dict = attribute_relationships_dict + self.dmn = DataModelNodes(self.attribute_relationships_dict) + self.dme = DataModelEdges() + self.dmr = DataModelRelationships() + + if not self.attribute_relationships_dict: + raise ValueError( + "Something has gone wrong, a data model was not loaded into the DataModelGraph Class. Please check that your paths are correct" + ) + self.graph = self.generate_data_model_graph() + + def generate_data_model_graph(self) -> nx.MultiDiGraph: + """Generate NetworkX Graph from the Relationships/attributes dictionary, the graph is built by first adding all nodes to the graph, then connecting nodes by the relationships defined in the attributes_relationship dictionary. + Returns: + G: nx.MultiDiGraph, networkx graph representation of the data model + """ + # Get all relationships with edges + edge_relationships = self.dmr.retreive_rel_headers_dict(edge=True) + + # Find all nodes + all_nodes = self.dmn.gather_all_nodes_in_model( + attr_rel_dict=self.attribute_relationships_dict + ) + + # Instantiate NetworkX MultiDigraph + G = nx.MultiDiGraph() + + all_node_dict = {} + + ## Fill in MultiDigraph with nodes + for node in all_nodes: + # Gather information for each node + node_dict = self.dmn.generate_node_dict( + node, self.attribute_relationships_dict + ) + + # Add each node to the all_node_dict to be used for generating edges + all_node_dict[node] = node_dict + + # Generate node and attach information (attributes) to each node + G = self.dmn.generate_node(G, node_dict) + + edge_list = [] + ## Connect nodes via edges + for node in all_nodes: + # Generate edges + edge_list_2 = self.dme.generate_edge( + node, + all_node_dict, + self.attribute_relationships_dict, + edge_relationships, + edge_list, + ) + edge_list = edge_list_2.copy() + + # Add edges to the Graph + for node_1, node_2, edge_dict in edge_list: + G.add_edge(node_1, node_2, key=edge_dict['key'], weight=edge_dict['weight']) + return G + + +class DataModelGraphExplorer: + def __init__( + self, + G, + ): + """Load data model graph as a singleton. + Args: + G: nx.MultiDiGraph, networkx graph representation of the data model + """ + self.graph = G # At this point the graph is expected to be fully formed. + self.dmr = DataModelRelationships() + self.rel_dict = self.dmr.relationships_dictionary + + def find_properties(self) -> set[str]: + """Identify all properties, as defined by the first node in a pair, connected with 'domainIncludes' edge type + Returns: + properties, set: All properties defined in the data model, each property name is defined by its label. + """ + properties = [] + for node_1, node_2, rel in self.graph.edges: + if rel == self.rel_dict["domainIncludes"]["edge_key"]: + properties.append(node_1) + properties = set(properties) + return properties + + def find_classes(self) -> set[str]: + """Identify all classes, as defined but all nodes, minus all properties (which are explicitly defined) + Returns: + classes, set: All classes defined in the data model, each class name is defined by its label. 
+ """ + nodes = self.graph.nodes + properties = self.find_properties() + classes = nodes - properties + return classes + + def find_node_range( + self, node_label: Optional[str] = None, node_display_name: Optional[str] = None + ) -> list: + """Get valid values for the given node (attribute) + Args: + node_label, str, Optional[str]: label of the node for which to retrieve valid values + node_display_name, str, Optional[str]: Display Name of the node for which to retrieve valid values + Returns: + valid_values, list: List of valid values associated with the provided node. + """ + if not node_label: + node_label = self.get_node_label(node_display_name) + + valid_values = [] + for node_1, node_2, rel in self.graph.edges: + if ( + node_1 == node_label + and rel == self.rel_dict["rangeIncludes"]["edge_key"] + ): + valid_values.append(node_2) + valid_values = list(set(valid_values)) + return valid_values + + def get_adjacent_nodes_by_relationship( + self, node_label: str, relationship: str + ) -> list[str]: + """Get a list of nodes that is / are adjacent to a given node, based on a relationship type. + + Args: + node_label: label of the the node whose edges we need to look at. + relationship: the type of link(s) that the above node and its immediate neighbors share. + + Returns: + List of nodes that are adjacent to the given node. + #checked + """ + nodes = set() + for node_1, node_2, key, _ in self.graph.out_edges( + node_label, data=True, keys=True + ): + if key == relationship: + nodes.add(node_2) + + return list(nodes) + + def get_component_requirements( + self, + source_component: str, + ) -> list[str]: + """Get all components that are associated with a given source component and are required by it. + + Args: + source_component: source component for which we need to find all required downstream components. + + Returns: + List of nodes that are descendants from the source component are are related to the source through a specific component relationship. + """ + + req_components = list( + reversed( + self.get_descendants_by_edge_type( + source_component, + self.rel_dict["requiresComponent"]["edge_key"], + ordered=True, + ) + ) + ) + + return req_components + + def get_component_requirements_graph( + self, + source_component: str, + ) -> nx.DiGraph: + """Get all components that are associated with a given source component and are required by it; return the components as a dependency graph (i.e. a DAG). + + Args: + source_component, str: source component for which we need to find all required downstream components. + + Returns: + A subgraph of the schema graph induced on nodes that are descendants from the source component and are related to the source through a specific component relationship. + """ + + # get a list of required component nodes + req_components = self.get_component_requirements(source_component) + + # get the subgraph induced on required component nodes + req_components_graph = self.get_subgraph_by_edge_type( + self.rel_dict["requiresComponent"]["edge_key"], + ).subgraph(req_components) + + return req_components_graph + + def get_descendants_by_edge_type( + self, + source_node: str, + relationship: str, + connected: bool = True, + ordered: bool = False, + ) -> list[str]: + """Get all nodes that are descendants of a given source node, based on a specific type of edge / relationship type. + + Args: + source_node: The node whose descendants need to be retreived. + relationship: Edge / link relationship type with possible values same as in above docs. 
+ connected: If True, we need to ensure that all descendant nodes are reachable from the source node, i.e., they are part of the same connected component. + If False, the descendants could be in multiple connected components. + Default value is True. + ordered: If True, the list of descendants will be topologically ordered. + If False, the list has no particular order (depends on the order in which the descendats were traversed in the subgraph). + + Returns: + List of nodes that are descendants from a particular node (sorted / unsorted) + """ + + root_descendants = nx.descendants(self.graph, source_node) + + subgraph_nodes = list(root_descendants) + subgraph_nodes.append(source_node) + descendants_subgraph = self.graph.subgraph(subgraph_nodes) + + # prune the descendants subgraph so as to include only those edges that match the relationship type + rel_edges = [] + for node_1, node_2, key, _ in descendants_subgraph.edges(data=True, keys=True): + if key == relationship: + rel_edges.append((node_1, node_2)) + + relationship_subgraph = nx.DiGraph() + relationship_subgraph.add_edges_from(rel_edges) + + descendants = relationship_subgraph.nodes() + + if not descendants: + # return empty list if there are no nodes that are reachable from the source node based on this relationship type + return [] + + if connected and ordered: + # get the set of reachable nodes from the source node + descendants = nx.descendants(relationship_subgraph, source_node) + descendants.add(source_node) + + # normally, the descendants from a node are unordered (peculiarity of nx descendants call) + # form the subgraph on descendants and order it topologically + # this assumes an acyclic subgraph + descendants = nx.topological_sort( + relationship_subgraph.subgraph(descendants) + ) + elif connected: + # get the nodes that are reachable from a given source node + # after the pruning process above some nodes in the root_descendants subgraph might have become disconnected and will be omitted + descendants = nx.descendants(relationship_subgraph, source_node) + descendants.add(source_node) + elif ordered: + # sort the nodes topologically + # this requires the graph to be an acyclic graph + descendants = nx.topological_sort(relationship_subgraph) + + return list(descendants) + + def get_digraph_by_edge_type(self, edge_type: str) -> nx.DiGraph: + """Get a networkx digraph of the nodes connected via a given edge_type. + Args: + edge_type: + Edge type to search for, possible types are defined by 'edge_key' in relationship class + Returns: + """ + digraph = nx.DiGraph() + for node_1, node_2, key, _ in self.graph.edges(data=True, keys=True): + if key == edge_type: + digraph.add_edge(node_1, node_2) + return digraph + + def get_edges_by_relationship( + self, + node: str, + relationship: str, + ) -> list[str]: + """Get a list of out-edges of a node where the edges match a specifc type of relationship. + + i.e., the edges connecting a node to its neighbors are of relationship type -- "parentOf" (set of edges to children / sub-class nodes). + + Args: + node: the node whose edges we need to look at. + relationship: the type of link(s) that the above node and its immediate neighbors share. + + Returns: + List of edges that are connected to the node. 
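The traversal helpers above share one networkx idiom: iterating the MultiDiGraph with `keys=True` so the edge key (the relationship type) can be filtered. A small, self-contained illustration with hypothetical node names:

```python
import networkx as nx

G = nx.MultiDiGraph()
G.add_edge("Patient", "Sex", key="requiresDependency", weight=0)
G.add_edge("Patient", "Diagnosis", key="requiresDependency", weight=1)
G.add_edge("CancerType", "Breast", key="rangeIncludes", weight=0)

# With data=True and keys=True each edge comes back as a (u, v, key, data) 4-tuple.
dependencies = [
    v
    for _, v, key, _ in G.out_edges("Patient", data=True, keys=True)
    if key == "requiresDependency"
]
print(dependencies)  # ['Sex', 'Diagnosis']
```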
+ """ + edges = [] + + for node_1, node_2, key, _ in self.graph.out_edges(node, data=True, keys=True): + if key == relationship: + edges.append((node_1, node_2)) + + return edges + + def get_ordered_entry(self, key: str, source_node_label: str) -> list[str]: + """Order the values associated with a particular node and edge_key to match original ordering in schema. + Args: + key: a key representing and edge relationship in DataModelRelationships.relationships_dictionary + source_node_label, str: node to look for edges of and order + Returns: + sorted_nodes, list: list of sorted nodes, that share the specified relationship with the source node + Example: + For the example data model, for key='rangeIncludes', source_node_label='CancerType' the return would be ['Breast, 'Colorectal', 'Lung', 'Prostate', 'Skin'] in that exact order. + Raises: + KeyError, cannot find source node in graph + """ + # Check if node is in the graph, if not throw an error. + if not self.is_class_in_schema(node_label=source_node_label): + raise KeyError( + f"Cannot find node: {source_node_label} in the graph, please check entry." + ) + + edge_key = self.rel_dict[key]["edge_key"] + + # Handle out edges + if self.rel_dict[key]["jsonld_direction"] == "out": + # use outedges + + original_edge_weights_dict = { + attached_node: self.graph[source_node][attached_node][edge_key][ + "weight" + ] + for source_node, attached_node in self.graph.out_edges( + source_node_label + ) + if edge_key in self.graph[source_node][attached_node] + } + # Handle in edges + else: + # use inedges + original_edge_weights_dict = { + attached_node: self.graph[attached_node][source_node][edge_key][ + "weight" + ] + for attached_node, source_node in self.graph.in_edges(source_node_label) + if edge_key in self.graph[attached_node][source_node] + } + + sorted_nodes = list( + dict( + sorted(original_edge_weights_dict.items(), key=lambda item: item[1]) + ).keys() + ) + + return sorted_nodes + + # Get values associated with a node + def get_nodes_ancestors(self, subgraph: nx.DiGraph, node_label: str) -> list[str]: + """Get a list of nodes reachable from source component in graph + Args: + subgraph: networkx graph object + node_label, str: label of node to find ancestors for + Returns: + all_ancestors, list: nodes reachable from source in graph + """ + all_ancestors = list(nx.ancestors(subgraph, node_label)) + + return all_ancestors + + def get_node_comment( + self, node_display_name: str = None, node_label: str = None + ) -> str: + """Get the node definition, i.e., the "comment" associated with a given node display name. + + Args: + node_display_name, str: Display name of the node which you want to get the comment for. + node_label, str: Label of the node you would want to get the comment for. + Returns: + Comment associated with node, as a string. + """ + if not node_label: + node_label = self.get_node_label(node_display_name) + + if not node_label: + return "" + + node_definition = self.graph.nodes[node_label][ + self.rel_dict["comment"]["node_label"] + ] + return node_definition + + def get_node_dependencies( + self, + source_node: str, + display_names: bool = True, + schema_ordered: bool = True, + ) -> list[str]: + """Get the immediate dependencies that are related to a given source node. + + Args: + source_node: The node whose dependencies we need to compute. + display_names: if True, return list of display names of each of the dependencies. + if False, return list of node labels of each of the dependencies. 
+ schema_ordered: if True, return the dependencies of the node following the order of the schema (slower). + if False, return dependencies from graph without guaranteeing schema order (faster) + + Returns: + List of nodes that are dependent on the source node. + """ + + if schema_ordered: + # get dependencies in the same order in which they are defined in the schema + required_dependencies = self.get_ordered_entry( + key=self.rel_dict["requiresDependency"]["edge_key"], + source_node_label=source_node, + ) + else: + required_dependencies = self.get_adjacent_nodes_by_relationship( + node_label=source_node, + relationship=self.rel_dict["requiresDependency"]["edge_key"], + ) + + if display_names: + # get display names of dependencies + dependencies_display_names = [] + + for req in required_dependencies: + dependencies_display_names.append( + self.graph.nodes[req][self.rel_dict["displayName"]["node_label"]] + ) + + return dependencies_display_names + + return required_dependencies + + def get_nodes_descendants(self, node_label: str) -> list[str]: + """Return a list of nodes reachable from source in graph + Args: + node_label, str: any given node + Return: + all_descendants, list: nodes reachable from source in graph + """ + all_descendants = list(nx.descendants(self.graph, node_label)) + + return all_descendants + + def get_nodes_display_names( + self, + node_list: list[str], + ) -> list[str]: + """Get display names associated with the given list of nodes. + + Args: + node_list: List of nodes whose display names we need to retrieve. + + Returns: + List of display names. + """ + node_list_display_names = [ + self.graph.nodes[node][self.rel_dict["displayName"]["node_label"]] + for node in node_list + ] + + return node_list_display_names + + def get_node_label(self, node_display_name: str) -> str: + """Get the node label for a given display name. + + Args: + node_display_name: Display name of the node which you want to get the label for. + Returns: + Node label associated with given node. + If display name not part of schema, return an empty string. + """ + + node_class_label = get_class_label_from_display_name( + display_name=node_display_name + ) + node_property_label = get_property_label_from_display_name( + display_name=node_display_name + ) + + if node_class_label in self.graph.nodes: + node_label = node_class_label + elif node_property_label in self.graph.nodes: + node_label = node_property_label + else: + node_label = "" + + return node_label + + def get_node_range( + self, + node_label: Optional[str] = None, + node_display_name: Optional[str] = None, + display_names: bool = False, + ) -> list[str]: + """Get the range, i.e., all the valid values that are associated with a node label. + + Args: + node_label: Node for which you need to retrieve the range. + display_names, bool: True + Returns: + required_range: Returned if display_names=False, list of valid values (labels) associated with a given node. + dependencies_display_name: Returned if display_names=True, + List of valid values (display names) associated with a given node + Raises: + ValueError: If the node cannot be found in the graph. + """ + if not node_label: + node_label = self.get_node_label(node_display_name) + + try: + # get node range in the order defined in schema for given node + required_range = self.find_node_range(node_label=node_label) + except KeyError: + raise ValueError( + f"The source node {node_label} does not exist in the graph. " + "Please use a different node." 
+ ) + + if display_names: + # get the display name(s) of all dependencies + dependencies_display_names = [] + + for req in required_range: + dependencies_display_names.append(self.graph.nodes[req]["displayName"]) + + return dependencies_display_names + + return required_range + + def get_node_required( + self, node_label: Optional[str] = None, node_display_name: Optional[str] = None + ) -> bool: + """Check if a given node is required or not. + + Note: The possible options that a node can be associated with -- "required" / "optional". + + Args: + node_label: Label of the node for which you need to look up. + node_display_name: Display name of the node for which you want look up. + Returns: + True: If the given node is a "required" node. + False: If the given node is not a "required" (i.e., an "optional") node. + """ + if not node_label: + node_label = self.get_node_label(node_display_name) + + rel_node_label = self.rel_dict["required"]["node_label"] + node_required = self.graph.nodes[node_label][rel_node_label] + return node_required + + def get_node_validation_rules( + self, node_label: Optional[str] = None, node_display_name: Optional[str] = None + ) -> str: + """Get validation rules associated with a node, + + Args: + node_label: Label of the node for which you need to look up. + node_display_name: Display name of the node which you want to get the label for. + Returns: + A set of validation rules associated with node, as a list. + """ + if not node_label: + node_label = self.get_node_label(node_display_name) + + if not node_label: + return [] + + node_validation_rules = self.graph.nodes[node_label]["validationRules"] + + return node_validation_rules + + def get_subgraph_by_edge_type(self, relationship: str) -> nx.DiGraph: + """Get a subgraph containing all edges of a given type (aka relationship). + + Args: + relationship: edge / link relationship type with possible values same as in above docs. + + Returns: + Directed graph on edges of a particular type (aka relationship) + """ + + # prune the metadata model graph so as to include only those edges that match the relationship type + rel_edges = [] + for node_1, node_2, key, _ in self.graph.out_edges(data=True, keys=True): + if key == relationship: + rel_edges.append((node_1, node_2)) + + relationship_subgraph = nx.DiGraph() + relationship_subgraph.add_edges_from(rel_edges) + + return relationship_subgraph + + def find_adjacent_child_classes( + self, node_label: Optional[str] = None, node_display_name: Optional[str] = None + ) -> list[str]: + """Find child classes of a given node. + Args: + node_display_name: Display name of the node to look up. + node_label: Label of the node to look up. + Returns: + List of nodes that are adjacent to the given node, by SubclassOf relationship. + """ + if not node_label: + node_label = self.get_node_label(node_display_name) + + return self.get_adjacent_nodes_by_relationship( + node_label=node_label, relationship=self.rel_dict["subClassOf"]["edge_key"] + ) + + def find_child_classes(self, schema_class: str) -> list: + """Find schema classes that inherit from the given class + Args: + schema_class: node label for the class to from which to look for children. + Returns: + list of children to the schema_class. + """ + return unlist(list(self.graph.successors(schema_class))) + + def find_class_specific_properties(self, schema_class: str) -> list[str]: + """Find properties specifically associated with a given class + Args: + schema_class, str: node/class label, to identify properties for. 
+ Returns: + properties, list: List of properties associate with a given schema class. + Raises: + KeyError: Key error is raised if the provded schema_class is not in the graph + """ + + if not self.is_class_in_schema(schema_class): + raise KeyError( + f"Schema_class provided: {schema_class} is not in the data model, please check that you are providing the proper class/node label" + ) + + properties = [] + for n1, n2 in self.graph.edges(): + if n2 == schema_class and "domainValue" in self.graph[n1][schema_class]: + properties.append(n1) + return properties + + def find_parent_classes(self, node_label: str) -> list[list[str]]: + """Find all parents of the provided node + Args: + node_label: label of the node to find parents of + Returns: + List of list of Parents to the given node. + """ + # Get digraph of nodes with parents + digraph = self.get_digraph_by_edge_type("parentOf") + + # Get root node + root_node = list(nx.topological_sort(digraph))[0] + + # Get paths between root_node and the target node. + paths = nx.all_simple_paths(self.graph, source=root_node, target=node_label) + + return [_path[:-1] for _path in paths] + + def full_schema_graph(self, size: Optional[int] = None) -> graphviz.Digraph: + """Create a graph of the data model. + Args: + size, float: max height and width of the graph, if one value provided it is used for both. + Returns: + schema graph viz + """ + edges = self.graph.edges() + return visualize(edges, size=size) + + def is_class_in_schema(self, node_label: str) -> bool: + """Determine if provided node_label is in the schema graph/data model. + Args: + node_label: label of node to search for in the + Returns: + True, if node is in the graph schema + False, if node is not in graph schema + """ + if node_label in self.graph.nodes(): + return True + else: + return False + + def sub_schema_graph( + self, source: str, direction: str, size=None + ) -> Optional[graphviz.Digraph]: + """Create a sub-schema graph + Args: + source, str: source node label to start graph + direction, str: direction to create the vizualization, choose from "up", "down", "both" + size, float: max height and width of the graph, if one value provided it is used for both. 
+ Returns: + Sub-schema graph viz + """ + if direction == "down": + edges = list(nx.edge_bfs(self.graph, [source])) + return visualize(edges, size=size) + elif direction == "up": + paths = self.find_parent_classes(source) + edges = [] + for _path in paths: + _path.append(source) + for i in range(0, len(_path) - 1): + edges.append((_path[i], _path[i + 1])) + return visualize(edges, size=size) + elif direction == "both": + paths = self.find_parent_classes(source) + edges = list(nx.edge_bfs(self.graph, [source])) + for _path in paths: + _path.append(source) + for i in range(0, len(_path) - 1): + edges.append((_path[i], _path[i + 1])) + return visualize(edges, size=size) diff --git a/schematic/schemas/data_model_json_schema.py b/schematic/schemas/data_model_json_schema.py new file mode 100644 index 000000000..844f5a4ec --- /dev/null +++ b/schematic/schemas/data_model_json_schema.py @@ -0,0 +1,383 @@ +import logging +import networkx as nx +import os +from typing import Any, Dict, Optional, Text, List + +from schematic.schemas.data_model_graph import DataModelGraphExplorer +from schematic.schemas.data_model_relationships import DataModelRelationships + +from schematic.utils.validate_utils import rule_in_rule_list + +logger = logging.getLogger(__name__) + + +class DataModelJSONSchema: + def __init__( + self, + jsonld_path: str, + graph: nx.MultiDiGraph, + ): + # TODO: Change jsonld_path to data_model_path (can work with CSV too) + self.jsonld_path = jsonld_path + self.graph = graph # Graph would be fully made at this point. + self.dmge = DataModelGraphExplorer(self.graph) + self.dmr = DataModelRelationships() + self.rel_dict = self.dmr.relationships_dictionary + + def get_array_schema( + self, node_range: List[str], node_name: str, blank=False + ) -> Dict[str, Dict[str, List[str]]]: + """Add a list of nodes to the "enum" key in a given JSON schema object. + Allow a node to be mapped to any subset of the list + + Args: + node_name: Name of the "main" / "head" key in the JSON schema / object. + node_range: List of nodes to be added to the JSON object. + blank: If True, add empty node to end of node list. + If False, do not add empty node to end of node list. + + Returns: + JSON object with array validation rule. + """ + + schema_node_range_array = { + node_name: { + "type": "array", + "items": {"enum": node_range + [""] if blank else node_range}, + "maxItems": len(node_range), + } + } + + return schema_node_range_array + + def get_non_blank_schema( + self, node_name: str + ) -> Dict[str, dict[str, Any]]: # can't define heterogenous Dict generic types + """Get a schema rule that does not allow null or empty values. + + Args: + node_name: Name of the node on which the schema rule is to be applied. + + Returns: + Schema rule as a JSON object. + """ + non_blank_schema = {node_name: {"not": {"type": "null"}, "minLength": 1}} + + return non_blank_schema + + def get_range_schema( + self, node_range: List[str], node_name: str, blank=False + ) -> Dict[str, Dict[str, List[str]]]: + """Add a list of nodes to the "enum" key in a given JSON schema object. + + Args: + node_name: Name of the "main" / "head" key in the JSON schema / object. + node_range: List of nodes to be added to the JSON object. + blank: If True, add empty node to end of node list. + If False, do not add empty node to end of node list. + + Returns: + JSON object with nodes. 
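To make the three schema-fragment helpers concrete, here is what they return for a hypothetical `CancerType` node (`dmjs` being an instantiated `DataModelJSONSchema`):

```python
dmjs.get_range_schema(node_range=["Breast", "Lung"], node_name="CancerType")
# {'CancerType': {'enum': ['Breast', 'Lung']}}

dmjs.get_range_schema(node_range=["Breast", "Lung"], node_name="CancerType", blank=True)
# {'CancerType': {'enum': ['Breast', 'Lung', '']}}

dmjs.get_array_schema(node_range=["Breast", "Lung"], node_name="CancerType")
# {'CancerType': {'type': 'array', 'items': {'enum': ['Breast', 'Lung']}, 'maxItems': 2}}

dmjs.get_non_blank_schema(node_name="CancerType")
# {'CancerType': {'not': {'type': 'null'}, 'minLength': 1}}
```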
+ """ + if blank: + schema_node_range = {node_name: {"enum": node_range + [""]}} + else: + schema_node_range = {node_name: {"enum": node_range}} + + return schema_node_range + + def get_json_validation_schema( + self, source_node: str, schema_name: str + ) -> Dict[str, dict[str, Any]]: + """ + Consolidated method that aims to gather dependencies and value constraints across terms / nodes in a schema.org schema and store them in a jsonschema /JSON Schema schema. + + It does so for any given node in the schema.org schema (recursively) using the given node as starting point in the following manner: + 1) Find all the nodes / terms this node depends on (which are required as "additional metadata" given this node is "required"). + 2) Find all the allowable metadata values / nodes that can be assigned to a particular node (if such a constraint is specified on the schema). + + Args: + source_node: Node from which we can start recursive dependancy traversal (as mentioned above). + schema_name: Name assigned to JSON-LD schema (to uniquely identify it via URI when it is hosted on the Internet). + + Returns: + JSON Schema as a dictionary. + """ + json_schema = { + "$schema": "http://json-schema.org/draft-07/schema#", + "$id": "http://example.com/" + schema_name, + "title": schema_name, + "type": "object", + "properties": {}, + "required": [], + "allOf": [], + } + + nodes_to_process = ( + [] + ) # list of nodes to be checked for dependencies, starting with the source node + processed_nodes = ( + [] + ) # keep of track of nodes whose dependencies have been processed + reverse_dependencies = ( + {} + ) # maintain a map between conditional nodes and their dependencies (reversed) -- {dependency : conditional_node} + range_domain_map = ( + {} + ) # maintain a map between range nodes and their domain nodes {range_value : domain_value} + # the domain node is very likely the parentof ("parentOf" relationship) of the range node + + root_dependencies = self.dmge.get_adjacent_nodes_by_relationship( + node_label=source_node, + relationship=self.rel_dict["requiresDependency"]["edge_key"], + ) + + # if root_dependencies is empty it means that a class with name 'source_node' exists + # in the schema, but it is not a valid component + if not root_dependencies: + raise ValueError(f"'{source_node}' is not a valid component in the schema.") + + nodes_to_process += root_dependencies + + process_node = nodes_to_process.pop(0) + + while process_node: + if not process_node in processed_nodes: + # node is being processed + node_is_processed = True + + node_range = self.dmge.get_adjacent_nodes_by_relationship( + node_label=process_node, + relationship=self.rel_dict["rangeIncludes"]["edge_key"], + ) + + # get node range display name + node_range_d = self.dmge.get_nodes_display_names(node_list=node_range) + + node_dependencies = self.dmge.get_adjacent_nodes_by_relationship( + node_label=process_node, + relationship=self.rel_dict["requiresDependency"]["edge_key"], + ) + + # get process node display name + node_display_name = self.graph.nodes[process_node][ + self.rel_dict["displayName"]["node_label"] + ] + + # updating map between node and node's valid values + for n in node_range_d: + if not n in range_domain_map: + range_domain_map[n] = [] + range_domain_map[n].append(node_display_name) + + # can this node be map to the empty set (if required no; if not required yes) + # TODO: change "required" to different term, required may be a bit misleading (i.e. 
is the node required in the schema) + node_required = self.dmge.get_node_required(node_label=process_node) + + # get any additional validation rules associated with this node (e.g. can this node be mapped to a list of other nodes) + node_validation_rules = self.dmge.get_node_validation_rules( + node_display_name=node_display_name + ) + + if node_display_name in reverse_dependencies: + # if node has conditionals set schema properties and conditional dependencies + # set schema properties + if node_range: + # if process node has valid value range set it in schema properties + schema_valid_vals = self.get_range_schema( + node_range=node_range_d, + node_name=node_display_name, + blank=True, + ) + + if node_validation_rules: + # if this node has extra validation rules process them + # TODO: abstract this into its own validation rule constructor/generator module/class + if rule_in_rule_list("list", node_validation_rules): + # if this node can be mapped to a list of nodes + # set its schema accordingly + schema_valid_vals = self.get_array_schema( + node_range=node_range_d, + node_name=node_display_name, + blank=True, + ) + + else: + # otherwise, by default allow any values + schema_valid_vals = {node_display_name: {}} + + json_schema["properties"].update(schema_valid_vals) + + # set schema conditional dependencies + for node in reverse_dependencies[node_display_name]: + # set all of the conditional nodes that require this process node + + # get node domain if any + # ow this node is a conditional requirement + if node in range_domain_map: + domain_nodes = range_domain_map[node] + conditional_properties = {} + + for domain_node in domain_nodes: + # set range of conditional node schema + conditional_properties.update( + { + "properties": {domain_node: {"enum": [node]}}, + "required": [domain_node], + } + ) + + # given node conditional are satisfied, this process node (which is dependent on these conditionals) has to be set or not depending on whether it is required + if node_range: + dependency_properties = self.get_range_schema( + node_range=node_range_d, + node_name=node_display_name, + blank=not node_required, + ) + + if node_validation_rules: + if rule_in_rule_list( + "list", node_validation_rules + ): + # TODO: get_range_schema and get_range_schema have similar behavior - combine in one module + dependency_properties = ( + self.get_array_schema( + node_range=node_range_d, + node_name=node_display_name, + blank=not node_required, + ) + ) + + else: + if node_required: + dependency_properties = ( + self.get_non_blank_schema( + node_name=node_display_name + ) + ) + else: + dependency_properties = {node_display_name: {}} + schema_conditional_dependencies = { + "if": conditional_properties, + "then": { + "properties": dependency_properties, + "required": [node_display_name], + }, + } + + # update conditional-dependency rules in json schema + json_schema["allOf"].append( + schema_conditional_dependencies + ) + + else: + # node doesn't have conditionals + if node_required: + if node_range: + schema_valid_vals = self.get_range_schema( + node_range=node_range_d, + node_name=node_display_name, + blank=False, + ) + + if node_validation_rules: + # If there are valid values AND they are expected to be a list, + # reformat the Valid Values. 
+ if rule_in_rule_list("list", node_validation_rules): + schema_valid_vals = self.get_array_schema( + node_range=node_range_d, + node_name=node_display_name, + blank=False, + ) + else: + schema_valid_vals = self.get_non_blank_schema( + node_name=node_display_name + ) + + json_schema["properties"].update(schema_valid_vals) + # add node to required fields + json_schema["required"] += [node_display_name] + + elif process_node in root_dependencies: + # node doesn't have conditionals and is not required; it belongs in the schema only if it is in root's dependencies + + if node_range: + schema_valid_vals = self.get_range_schema( + node_range=node_range_d, + node_name=node_display_name, + blank=True, + ) + + if node_validation_rules: + if rule_in_rule_list("list", node_validation_rules): + schema_valid_vals = self.get_array_schema( + node_range=node_range_d, + node_name=node_display_name, + blank=True, + ) + + else: + schema_valid_vals = {node_display_name: {}} + + json_schema["properties"].update(schema_valid_vals) + + else: + # node doesn't have conditionals and it is not required and it is not a root dependency + # the node doesn't belong in the schema + # do not add to processed nodes since its conditional may be traversed at a later iteration (though unlikely for most schemas we consider) + node_is_processed = False + + # add process node as a conditional to its dependencies + node_dependencies_d = self.dmge.get_nodes_display_names( + node_list=node_dependencies + ) + + for dep in node_dependencies_d: + if not dep in reverse_dependencies: + reverse_dependencies[dep] = [] + + reverse_dependencies[dep].append(node_display_name) + + # add nodes found as dependencies and range of this processed node + # to the list of nodes to be processed + nodes_to_process += node_range + nodes_to_process += node_dependencies + + # if the node is processed add it to the processed nodes set + if node_is_processed: + processed_nodes.append(process_node) + + # if the list of nodes to process is not empty + # set the process node the next remaining node to process + if nodes_to_process: + process_node = nodes_to_process.pop(0) + else: + # no more nodes to process + # exit the loop + break + + logger.info("JSON schema successfully generated from schema.org schema!") + + # if no conditional dependencies were added we can't have an empty 'AllOf' block in the schema, so remove it + if not json_schema["allOf"]: + del json_schema["allOf"] + + # If no config value and SchemaGenerator was initialized with + # a JSON-LD path, construct + if self.jsonld_path is not None: + self.jsonld_path_root, jsonld_ext = os.path.splitext(self.jsonld_path) + prefix = self.jsonld_path_root + prefix_root, prefix_ext = os.path.splitext(prefix) + if prefix_ext == ".model": + prefix = prefix_root + json_schema_log_file = f"{prefix}.{source_node}.schema.json" + """ + # Commenting out loggins since the JSON Schema file is not currently saved. + logger.info( + "The JSON schema file can be inspected by setting the following " + "nested key in the configuration: (model > location)." 
+ ) + + logger.info(f"JSON schema file log stored as {json_schema_log_file}") + """ + return json_schema diff --git a/schematic/schemas/data_model_jsonld.py b/schematic/schemas/data_model_jsonld.py new file mode 100644 index 000000000..fa9af86ef --- /dev/null +++ b/schematic/schemas/data_model_jsonld.py @@ -0,0 +1,461 @@ +import copy +from dataclasses import dataclass, field, asdict +from dataclasses_json import config, dataclass_json +import json +import logging + +from typing import Any, Dict, Optional, Text, List +import networkx as nx + +from schematic.schemas.data_model_graph import DataModelGraphExplorer +from schematic.schemas.data_model_relationships import DataModelRelationships +from schematic.utils.schema_utils import ( + get_label_from_display_name, + convert_bool_to_str, + strip_context, +) + +logging.basicConfig() +logger = logging.getLogger(__name__) + + +@dataclass_json +@dataclass +class BaseTemplate: + magic_context: str = field( + default_factory=lambda: { + "bts": "http://schema.biothings.io/", + "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#", + "rdfs": "http://www.w3.org/2000/01/rdf-schema#", + "schema": "http://schema.org/", + "xsd": "http://www.w3.org/2001/XMLSchema#", + }, + metadata=config(field_name="@context"), + ) + magic_graph: str = field(default_factory=list, metadata=config(field_name="@graph")) + magic_id: str = field( + default="http://schema.biothings.io/#0.1", metadata=config(field_name="@id") + ) + + +@dataclass_json +@dataclass +class PropertyTemplate: + magic_id: str = field(default="", metadata=config(field_name="@id")) + magic_type: str = field(default="rdf:Property", metadata=config(field_name="@type")) + magic_comment: str = field(default="", metadata=config(field_name="rdfs:comment")) + magic_label: str = field(default="", metadata=config(field_name="rdfs:label")) + magic_domain_includes: list = field( + default_factory=list, metadata=config(field_name="schema:domainIncludes") + ) + magic_range_includes: list = field( + default_factory=list, metadata=config(field_name="schema:rangeIncludes") + ) + magic_isPartOf: dict = field( + default_factory=dict, metadata=config(field_name="schema:isPartOf") + ) + magic_displayName: str = field( + default="", metadata=config(field_name="sms:displayName") + ) + magic_required: str = field( + default="sms:false", metadata=config(field_name="sms:required") + ) + magic_validationRules: list = field( + default_factory=list, metadata=config(field_name="sms:validationRules") + ) + + +@dataclass_json +@dataclass +class ClassTemplate: + magic_id: str = field(default="", metadata=config(field_name="@id")) + magic_type: str = field(default="rdfs:Class", metadata=config(field_name="@type")) + magic_comment: str = field(default="", metadata=config(field_name="rdfs:comment")) + magic_label: str = field(default="", metadata=config(field_name="rdfs:label")) + magic_subClassOf: list = field( + default_factory=list, metadata=config(field_name="rdfs:subClassOf") + ) + magic_range_includes: list = field( + default_factory=list, metadata=config(field_name="schema:rangeIncludes") + ) + magic_isPartOf: dict = field( + default_factory=dict, metadata=config(field_name="schema:isPartOf") + ) + magic_displayName: str = field( + default="", metadata=config(field_name="sms:displayName") + ) + magic_required: str = field( + default="sms:false", metadata=config(field_name="sms:required") + ) + magic_requiresDependency: list = field( + default_factory=list, metadata=config(field_name="sms:requiresDependency") + ) + 
magic_requiresComponent: list = field( + default_factory=list, metadata=config(field_name="sms:requiresComponent") + ) + magic_validationRules: list = field( + default_factory=list, metadata=config(field_name="sms:validationRules") + ) + + +class DataModelJsonLD(object): + """ + #Interface to JSONLD_object + """ + + def __init__(self, Graph: nx.MultiDiGraph, output_path: str = ""): + # Setup + self.graph = Graph # Graph would be fully made at this point. + self.dmr = DataModelRelationships() + self.rel_dict = self.dmr.relationships_dictionary + self.dmge = DataModelGraphExplorer(self.graph) + self.output_path = output_path + + # Gather the templates + base_template = BaseTemplate() + self.base_jsonld_template = json.loads(base_template.to_json()) + + property_template = PropertyTemplate() + self.property_template = json.loads(property_template.to_json()) + + class_template = ClassTemplate() + self.class_template = json.loads(class_template.to_json()) + + def get_edges_associated_with_node( + self, node: str + ) -> List[tuple[str, str, dict[str, int]]]: + """Retrieve all edges traveling in and out of a node. + Args: + node, str: Label of node in the graph to look for assiciated edges + Returns: + node_edges, list: List of Tuples of edges associated with the given node, tuple contains the two nodes, plus the weight dict associated with the edge connection. + """ + node_edges = list(self.graph.in_edges(node, data=True)) + node_edges.extend(list(self.graph.out_edges(node, data=True))) + return node_edges + + def get_edges_associated_with_property_nodes( + self, node:str + ) -> List[tuple[str, str, dict[str, int]]]: + """Get edges associated with property nodes to make sure we add that relationship. + Args: + node, str: Label of node property in the graph to look for assiciated edges + Returns: + node_edges, list: List of Tuples of edges associated with the given node, tuple contains the two nodes, plus the weight dict associated with the edge connection. + """ + # Get edge keys for domainIncludes and subclassOf + domainIncludes_edge_key = self.rel_dict['domainIncludes']['edge_key'] + node_edges = [] + # Get dict of edges for the current property node + node_edges_dict = self.graph[node] + for node_2, edge_dict in node_edges_dict.items(): + # Look through relationships in the edge dictionary + for edge_key in edge_dict: + # If the edge is a property or subclass then add the edges to the list + if edge_key in [domainIncludes_edge_key]: + node_edges.append((node, node_2, edge_dict[edge_key])) + return node_edges + + def add_edge_rels_to_template(self, template: dict, rel_vals: dict, node: str): + """ + Args: + template, dict: single class or property JSONLD template that is in the process of being filled. + rel_vals, dict: sub relationship dict for a given relationship (contains informtion like, 'edge_rel', 'jsonld_key' etc..) 
+ node, str: node whose edge information is presently being added to the JSONLD + Returns: + """ + # Get all edges associated with the current node + node_edges = self.get_edges_associated_with_node(node=node) + + + # For properties look for reverse relationships too + if node in self.dmge.find_properties(): + property_node_edges = self.get_edges_associated_with_property_nodes(node=node) + node_edges.extend(property_node_edges) + + # Get node pairs and weights for each edge + for node_1, node_2, weight in node_edges: + # Retrieve the relationship(s) and related info between the two nodes + node_edge_relationships = self.graph[node_1][node_2] + + # Get the relationship edge key + edge_key = rel_vals["edge_key"] + + # Check if edge_key is even one of the relationships for this node pair. + if edge_key in node_edge_relationships: + # for each relationship between the given nodes + for relationship, weight_dict in node_edge_relationships.items(): + # If the relationship defined and edge_key + if relationship == edge_key: + # TODO: rewrite to use edge_dir + domainIncludes_edge_key = self.rel_dict['domainIncludes']['edge_key'] + subclassOf_edge_key = self.rel_dict['subClassOf']['edge_key'] + if edge_key in [subclassOf_edge_key]: + if node_2 == node: + # Make sure the key is in the template (differs between properties and classes) + if rel_vals["jsonld_key"] in template.keys(): + node_1_id = {"@id": "bts:" + node_1} + # TODO Move this to a helper function to clear up. + if ( + isinstance( + template[rel_vals["jsonld_key"]], list + ) + and node_1_id + not in template[rel_vals["jsonld_key"]] + ): + template[rel_vals["jsonld_key"]].append( + node_1_id + ) + else: + template[rel_vals["jsonld_key"]] == node_1 + elif edge_key in [domainIncludes_edge_key]: + if node_1 == node: + # Make sure the key is in the template (differs between properties and classes) + if rel_vals["jsonld_key"] in template.keys(): + node_2_id = {"@id": "bts:" + node_2} + # TODO Move this to a helper function to clear up. + if ( + isinstance( + template[rel_vals["jsonld_key"]], list + ) + and node_2_id + not in template[rel_vals["jsonld_key"]] + ): + template[rel_vals["jsonld_key"]].append( + node_2_id + ) + else: + template[rel_vals["jsonld_key"]] == node_2 + else: + if node_1 == node: + # Make sure the key is in the template (differs between properties and classes) + if rel_vals["jsonld_key"] in template.keys(): + node_2_id = {"@id": "bts:" + node_2} + # TODO Move this to a helper function to clear up. + if ( + isinstance( + template[rel_vals["jsonld_key"]], list + ) + and node_2_id + not in template[rel_vals["jsonld_key"]] + ): + template[rel_vals["jsonld_key"]].append( + node_2_id + ) + else: + template[rel_vals["jsonld_key"]] == node_2 + return template + + def add_node_info_to_template(self, template, rel_vals, node): + """For a given node and relationship, add relevant value to template + Args: + template, dict: single class or property JSONLD template that is in the process of being filled. + rel_vals, dict: sub relationship dict for a given relationship (contains informtion like, 'edge_rel', 'jsonld_key' etc..) + node, str: node whose information is presently being added to the JSONLD + Returns: + template, dict: single class or property JSONLD template that is in the process of being filled, and now has had additional node information added. 
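`add_edge_rels_to_template` repeats the same guarded append in each branch (see the TODOs above). One possible shape for the helper those TODOs ask for, offered only as a sketch with a hypothetical name:

```python
def _append_id_reference(template: dict, jsonld_key: str, node_label: str) -> dict:
    """Sketch: append a {'@id': 'bts:<label>'} reference to a list-valued
    JSON-LD key, skipping keys the template does not define and avoiding duplicates."""
    node_id = {"@id": "bts:" + node_label}
    entry = template.get(jsonld_key)
    if isinstance(entry, list) and node_id not in entry:
        entry.append(node_id)
    return template
```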
+ """ + # Get label for relationship used in the graph + node_label = rel_vals["node_label"] + + # Get recorded info for current node, and the attribute type + node_info = nx.get_node_attributes(self.graph, node_label)[node] + + # Add this information to the template + template[rel_vals["jsonld_key"]] = node_info + return template + + def fill_entry_template(self, template: dict, node: str) -> dict: + """Fill in a blank JSONLD template with information for each node. All relationships are filled from the graph, based on the type of information (node or edge) + Args: + template, dict: empty class or property template to be filled with information for the given node. + node, str: target node to fill the template out for. + Returns: + template, dict: filled class or property template, that has been processed and cleaned up. + """ + data_model_relationships = self.dmr.relationships_dictionary + + # For each field in template fill out with information from the graph + for rel, rel_vals in data_model_relationships.items(): + key_context, key_rel = strip_context(context_value=rel_vals["jsonld_key"]) + + # Fill in the JSONLD template for this node, with data from the graph by looking up the nodes edge relationships, and the value information attached to the node. + + # Fill edge information (done per edge type) + if rel_vals["edge_rel"]: + template = self.add_edge_rels_to_template( + template=template, rel_vals=rel_vals, node=node + ) + + # Fill in node value information + else: + template = self.add_node_info_to_template( + template=template, rel_vals=rel_vals, node=node + ) + + # Clean up template + template = self.clean_template( + template=template, + data_model_relationships=data_model_relationships, + ) + + # Reorder lists based on weights: + template = self.reorder_template_entries( + template=template, + ) + # Add contexts to certain values + template = self.add_contexts_to_entries( + template=template, + ) + + return template + + def add_contexts_to_entries(self, template: dict) -> dict: + """ + Args: + template, dict: JSONLD template that has been filled up to the current node, with information + Returns: + template, dict: JSONLD template where contexts have been added back to certain values. 
+ Note: This will likely need to be modified when Contexts are truly added to the model + """ + for jsonld_key, entry in template.items(): + # Retrieve the relationships key using the jsonld_key + rel_key = [] + + for rel, rel_vals in self.rel_dict.items(): + if "jsonld_key" in rel_vals and jsonld_key == rel_vals["jsonld_key"]: + rel_key.append(rel) + + if rel_key: + rel_key = rel_key[0] + # If the current relationship can be defined with a 'node_attr_dict' + if "node_attr_dict" in self.rel_dict[rel_key].keys(): + try: + # if possible pull standard function to get node information + rel_func = self.rel_dict[rel_key]["node_attr_dict"]["standard"] + except: + # if not pull default function to get node information + rel_func = self.rel_dict[rel_key]["node_attr_dict"]["default"] + + # Add appropritae contexts that have been removed in previous steps (for JSONLD) or did not exist to begin with (csv) + if ( + rel_key == "id" + and rel_func == get_label_from_display_name + and "bts" not in str(template[jsonld_key]).lower() + ): + template[jsonld_key] = "bts:" + template[jsonld_key] + elif ( + rel_key == "required" + and rel_func == convert_bool_to_str + and "sms" not in str(template[jsonld_key]).lower() + ): + template[jsonld_key] = ( + "sms:" + str(template[jsonld_key]).lower() + ) + + return template + + def clean_template(self, template: dict, data_model_relationships: dict) -> dict: + """Get rid of empty k:v pairs. Fill with a default if specified in the relationships dictionary. + Args: + template, dict: JSONLD template for a single entry, keys specified in property and class templates. + data_model_relationships, dict: dictionary containing information for each relationship type supported. + Returns: + template: JSONLD template where unfilled entries have been removed, or filled with default depending on specifications in the relationships dictionary. + """ + for rels in data_model_relationships.values(): + # Get the current relationships, jsonld key + relationship_jsonld_key = rels["jsonld_key"] + # Check if the relationship_relationship_key is part of the template, and if it is, look to see if it has an entry + if ( + relationship_jsonld_key in template.keys() + and not template[rels["jsonld_key"]] + ): + # If there is no value recorded, fill out the template with the default relationship value (if recorded.) + if "jsonld_default" in rels.keys(): + template[relationship_jsonld_key] = rels["jsonld_default"] + else: + # If there is no default specified in the relationships dictionary, delete the empty value from the template. + del template[relationship_jsonld_key] + return template + + def reorder_template_entries(self, template: dict) -> dict: + """In JSONLD some classes or property keys have list values. We want to make sure these lists are ordered according to the order supplied by the user. + This will look specically in lists and reorder those. + Args: + template, dict: JSONLD template for a single entry, keys specified in property and class templates. + Returns: + template, dict: list entries re-ordered to match user supplied order. 
+ Note: + User order only matters for nodes that are also attributes + """ + template_label = template["rdfs:label"] + + for jsonld_key, entry in template.items(): + # Make sure dealing with an edge relationship: + is_edge = [ + "True" + for rel_key, rel_vals in self.rel_dict.items() + if rel_vals["jsonld_key"] == jsonld_key + if rel_vals["edge_rel"] == True + ] + + # if the entry is of type list and theres more than one value in the list attempt to reorder + if is_edge and isinstance(entry, list) and len(entry) > 1: + # Get edge key from data_model_relationships using the jsonld_key: + key, edge_key = [ + (rel_key, rel_vals["edge_key"]) + for rel_key, rel_vals in self.rel_dict.items() + if jsonld_key == rel_vals["jsonld_key"] + ][0] + + # Order edges + sorted_edges = self.dmge.get_ordered_entry( + key=key, source_node_label=template_label + ) + if not len(entry) == len(sorted_edges): + logger.error("There is an error with sorting values in the JSONLD, please issue a bug report.") + + edge_weights_dict = {edge: i for i, edge in enumerate(sorted_edges)} + ordered_edges = [0] * len(edge_weights_dict.keys()) + for edge, normalized_weight in edge_weights_dict.items(): + ordered_edges[normalized_weight] = {"@id": "bts:" + edge} + + # Throw an error if ordered_edges does not get fully filled as expected. + if 0 in ordered_edges: + logger.error( + "There was an issue getting values to match order specified in the data model, please submit a help request." + ) + template[jsonld_key] = ordered_edges + return template + + def generate_jsonld_object(self): + """Create the JSONLD object. + Returns: + jsonld_object, dict: JSONLD object containing all nodes and related information + """ + # Get properties. + properties = self.dmge.find_properties() + + # Get JSONLD Template + json_ld_template = self.base_jsonld_template + + # Iterativly add graph nodes to json_ld_template as properties or classes + for node in self.graph.nodes: + if node in properties: + # Get property template + property_template = copy.deepcopy(self.property_template) + obj = self.fill_entry_template(template=property_template, node=node) + else: + # Get class template + class_template = copy.deepcopy(self.class_template) + obj = self.fill_entry_template(template=class_template, node=node) + json_ld_template["@graph"].append(obj) + return json_ld_template + + +def convert_graph_to_jsonld(Graph): + # Make the JSONLD object + data_model_jsonld_converter = DataModelJsonLD(Graph=Graph) + jsonld_dm = data_model_jsonld_converter.generate_jsonld_object() + return jsonld_dm diff --git a/schematic/schemas/data_model_nodes.py b/schematic/schemas/data_model_nodes.py new file mode 100644 index 000000000..e82369789 --- /dev/null +++ b/schematic/schemas/data_model_nodes.py @@ -0,0 +1,274 @@ +from inspect import isfunction +import networkx as nx +from rdflib import Namespace +from typing import Any, Dict, Optional, Text, List, Callable + +from schematic.schemas.data_model_parser import DataModelJSONLDParser +from schematic.schemas.data_model_relationships import DataModelRelationships + +from schematic.utils.schema_utils import ( + get_label_from_display_name, + get_attribute_display_name_from_label, + convert_bool_to_str, + parse_validation_rules, +) +from schematic.utils.validate_rules_utils import validate_schema_rules +from schematic.schemas.curie import uri2curie, curie2uri + + +class DataModelNodes: + def __init__(self, attribute_relationships_dict): + self.namespaces = dict( + rdf=Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#") + ) + 
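Putting the JSON-LD pieces together, a short usage sketch (file names are hypothetical):

```python
import json

from schematic.schemas.data_model_jsonld import convert_graph_to_jsonld

# `graph` is the nx.MultiDiGraph produced by DataModelGraph(...).graph
jsonld_model = convert_graph_to_jsonld(graph)

with open("example.model.jsonld", "w") as jsonld_file:
    json.dump(jsonld_model, jsonld_file, indent=2)
```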
self.data_model_relationships = DataModelRelationships() + self.value_relationships = ( + self.data_model_relationships.retreive_rel_headers_dict(edge=False) + ) + self.edge_relationships_dictionary = ( + self.data_model_relationships.retreive_rel_headers_dict(edge=True) + ) + self.properties = self.get_data_model_properties( + attr_rel_dict=attribute_relationships_dict + ) + # retrieve a list of relationship types that will produce nodes. + self.node_relationships = list(self.edge_relationships_dictionary.values()) + + def gather_nodes(self, attr_info: tuple) -> list: + """Take in a tuple containing attriute name and relationship dictionary, and find all nodes defined in attribute information. + Args: + attr_info, tuple: (Display Name, Relationships Dictionary portion of attribute_relationships dictionary) + Returns: + nodes, list: nodes related to the given node (specified in attr_info). + Note: + Extracting nodes in this fashion ensures order is preserved. + """ + + # Extract attribute and relationship dictionary + attribute, relationship = attr_info + relationships = relationship["Relationships"] + + nodes = [] + if attribute not in nodes: + nodes.append(attribute) + for rel in self.node_relationships: + if rel in relationships.keys(): + nodes.extend([node for node in relationships[rel] if node is not None]) + return nodes + + def gather_all_nodes_in_model(self, attr_rel_dict: dict) -> list: + """Gather all nodes in the data model, in order. + Args: + attr_rel_dict, dict: generated in data_model_parser + {Attribute Display Name: { + Relationships: { + CSV Header: Value}}} + Returns: + all_nodes, list: List of all node display names in the data model preserving order entered. + Note: + Gathering nodes in this fashion ensures order is preserved. + """ + all_nodes = [] + for attr_info in attr_rel_dict.items(): + nodes = self.gather_nodes(attr_info=attr_info) + all_nodes.extend(nodes) + # Remove any duplicates preserving order + all_nodes = list(dict.fromkeys(all_nodes).keys()) + return all_nodes + + def get_rel_node_dict_info(self, relationship: str) -> Optional[tuple[str, dict]]: + """For each display name get defaults for nodes. + Args: + relationship, str: relationship key to match. + Returns: + rel_key, str: relationship node label + rel_node_dict, dict: node_attr_dict, from relationships dictionary for a given relationship + TODO: Move to data_model_relationships. + """ + for k, v in self.data_model_relationships.relationships_dictionary.items(): + if k == relationship: + if "node_attr_dict" in v.keys(): + rel_key = v["node_label"] + rel_node_dict = v["node_attr_dict"] + return rel_key, rel_node_dict + + def get_data_model_properties(self, attr_rel_dict: dict) -> list: + """Identify all properties defined in the data model. + Args: + attr_rel_dict, dict: + {Attribute Display Name: { + Relationships: { + CSV Header: Value}}} + Returns: + properties,list: properties defined in the data model + """ + properties = [] + for attribute, relationships in attr_rel_dict.items(): + if "Properties" in relationships["Relationships"].keys(): + properties.extend(relationships["Relationships"]["Properties"]) + properties = list(set(properties)) + return properties + + def get_entry_type(self, node_display_name: str) -> str: + """Get the entry type of the node, property or class. + Args: + node_display_name, str: display name of target node. + Returns: + entry_type, str: returns 'property' or 'class' based on data model specifications. 
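As a concrete illustration of the dictionary shape these methods walk, assuming the standard schematic CSV template headers such as 'DependsOn' and 'Valid Values' (attribute names here are hypothetical):

```python
from schematic.schemas.data_model_nodes import DataModelNodes

attr_rel_dict = {
    "Patient": {"Relationships": {"DependsOn": ["Sex", "Diagnosis"]}},
    "Sex": {"Relationships": {"Valid Values": ["Female", "Male"]}},
    "Diagnosis": {"Relationships": {"Valid Values": ["Healthy", "Cancer"]}},
}

dmn = DataModelNodes(attr_rel_dict)

# Nodes come back in model order with duplicates removed, e.g.
# ['Patient', 'Sex', 'Diagnosis', 'Female', 'Male', 'Healthy', 'Cancer']
print(dmn.gather_all_nodes_in_model(attr_rel_dict=attr_rel_dict))
```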
+ """ + if node_display_name in self.properties: + entry_type = "property" + else: + entry_type = "class" + return entry_type + + def run_rel_functions( + self, + rel_func: callable, + node_display_name: str = "", + key: str = "", + attr_relationships={}, + csv_header="", + entry_type="", + ): + """This function exists to centralzie handling of functions for filling out node information, makes sure all the proper parameters are passed to each function. + Args: + rel_func, callable: Function to call to get information to attach to the node + node_display_name, str: node display name + key, str: relationship key + attr_relationships, dict: relationships portion of attributes_relationships dictionary + csv_header, str: csv header + entry_type, str: 'class' or 'property' defines how + + Returns: + Outputs of specified rel_func (relationship function) + + For legacy: + elif key == 'id' and rel_func == get_label_from_display_name: + func_output = get_label_from_display_name(display_name =node_display_name, entry_type=entry_type) + """ + if rel_func == get_attribute_display_name_from_label: + return get_attribute_display_name_from_label( + node_display_name, attr_relationships + ) + + elif rel_func == parse_validation_rules: + return parse_validation_rules(attr_relationships[csv_header]) + + elif rel_func == get_label_from_display_name: + return get_label_from_display_name( + display_name=node_display_name, entry_type=entry_type + ) + + elif rel_func == convert_bool_to_str: + if type(attr_relationships[csv_header]) == str: + if attr_relationships[csv_header].lower() == "true": + return True + elif attr_relationships[csv_header].lower() == "false": + return False + + elif type(attr_relationships[csv_header]) == bool: + return attr_relationships[csv_header] + + else: + # Raise Error if the rel_func provided is not captured. + raise ValueError( + f"The function provided ({rel_func}) to define the relationship {key} is not captured in the function run_rel_functions, please update." + ) + + def generate_node_dict(self, node_display_name: str, attr_rel_dict: dict) -> dict: + """Gather information to be attached to each node. + Args: + node_display_name, str: display name for current node + attr_rel_dict, dict: generated in data_model_parser + {Attribute Display Name: { + Relationships: { + CSV Header: Value}}} + + Returns: + node_dict, dict: dictionary of relationship information about the current node + {'displayName': '', 'label': '', 'comment': 'TBD', 'required': None, 'validationRules': [], 'isPartOf': '', 'uri': ''} + Note: + If the default calls function, call that function for the default or alternate implementation. + May need to update this logic for varying function calls. (for example the current function takes in the node display name + would need to update if new function took in something else.) + """ + # Strip whitespace from node display name + node_display_name = node_display_name.strip() + + # Determine if property or class + entry_type = self.get_entry_type(node_display_name=node_display_name) + + # If the node is an attribute, find its relationships. + attr_relationships = {} + if node_display_name in attr_rel_dict.keys(): + attr_relationships = attr_rel_dict[node_display_name]["Relationships"] + + # Initialize node_dict + node_dict = {} + + # Look through relationship types that represent values (i.e. do not define edges) + for key, csv_header in self.value_relationships.items(): + # Get key and defalt values current relationship type. 
+ rel_key, rel_node_dict = self.get_rel_node_dict_info(key) + + # If we have information to add about this particular node, get it + if csv_header in attr_relationships.keys(): + # Check if the 'standard' specifies calling a function. + if "standard" in rel_node_dict.keys() and isfunction( + rel_node_dict["standard"] + ): + # Add to node_dict The value comes from the standard function call. + node_dict.update( + { + rel_key: self.run_rel_functions( + rel_node_dict["standard"], + node_display_name=node_display_name, + key=key, + attr_relationships=attr_relationships, + csv_header=csv_header, + entry_type=entry_type, + ) + } + ) + else: + # For standard entries, get information from attr_relationship dictionary + node_dict.update({rel_key: attr_relationships[csv_header]}) + # else, add default values + else: + # Check if the default specifies calling a function. + if "default" in rel_node_dict.keys() and isfunction( + rel_node_dict["default"] + ): + node_dict.update( + { + rel_key: self.run_rel_functions( + rel_node_dict["default"], + node_display_name=node_display_name, + key=key, + attr_relationships=attr_relationships, + csv_header=csv_header, + entry_type=entry_type, + ) + } + ) + else: + # Set value to defaults. + node_dict.update({rel_key: rel_node_dict["default"]}) + + return node_dict + + def generate_node(self, G: nx.MultiDiGraph, node_dict: dict) -> nx.MultiDiGraph: + """Create a node and add it to the networkx multidigraph being built + Args: + G, nx.MultiDigraph: networkx multidigraph object, that is in the process of being fully built. + node_dict, dict: dictionary of relationship information about the current node + Returns: + G, nx.MultiDigraph: networkx multidigraph object, that has had an additional node added to it. + """ + G.add_node(node_dict["label"], **node_dict) + return G + + def edit_node(self): + """Stub for future node editor.""" + return diff --git a/schematic/schemas/data_model_parser.py b/schematic/schemas/data_model_parser.py new file mode 100644 index 000000000..a541eb3ed --- /dev/null +++ b/schematic/schemas/data_model_parser.py @@ -0,0 +1,447 @@ +import logging +import pandas as pd +import pathlib +from typing import Any, Dict, Optional, Text, List, Union + +from schematic.utils.df_utils import load_df +from schematic.utils.io_utils import load_json +from schematic.utils.schema_utils import attr_dict_template + +from schematic.schemas.data_model_relationships import DataModelRelationships + +from schematic import LOADER + +logger = logging.getLogger("Synapse storage") + + +class DataModelParser: + """ + This class takes in a path to a data model and will convert it to an + attributes:relationship dictionarythat can then be further converted into a graph data model. + Other data model types may be added in the future. + """ + + def __init__( + self, + path_to_data_model: str, + ) -> None: + """ + Args: + path_to_data_model, str: path to data model. + """ + + self.path_to_data_model = path_to_data_model + self.model_type = self.get_model_type() + self.base_schema_path = None + + def _get_base_schema_path(self, base_schema: str = None) -> str: + """Evaluate path to base schema. + + Args: + base_schema: Path to base data model. BioThings data model is loaded by default. + + Returns: + base_schema_path: Path to base schema based on provided argument. 
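# Illustrative example of the default resolution implemented below (paths abbreviated,
# the override path is hypothetical):
#   parser._get_base_schema_path()                        -> ".../data_models/biothings.model.jsonld"
#   parser._get_base_schema_path("my_base.model.jsonld")  -> "my_base.model.jsonld"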
+ """ + biothings_schema_path = LOADER.filename("data_models/biothings.model.jsonld") + self.base_schema_path = ( + biothings_schema_path if base_schema is None else base_schema + ) + + return self.base_schema_path + + def get_model_type(self) -> str: + """Parses the path to the data model to extract the extension and determine the data model type. + Args: + path_to_data_model, str: path to data model + Returns: + str: uppercase, data model file extension. + Note: Consider moving this to Utils. + """ + return pathlib.Path(self.path_to_data_model).suffix.replace(".", "").upper() + + def parse_base_model(self) -> Dict: + """Parse base data model that new model could be built upon. + Returns: + base_model, dict: + {Attribute Display Name: { + Relationships: { + CSV Header: Value}}} + Note: Not configured yet to successfully parse biothings. + """ + + # Determine base schema path + base_model_path = self._get_base_schema_path(self.base_schema_path) + + # Parse + jsonld_parser = DataModelJSONLDParser() + base_model = jsonld_parser.parse_jsonld_model(base_model_path) + return base_model + + def parse_model(self) -> Dict[str, dict[str, Any]]: + """Given a data model type, instantiate and call the appropriate data model parser. + Returns: + model_dict, dict: + {Attribute Display Name: { + Relationships: { + CSV Header: Value}}} + Raises: + Value Error if an incorrect model type is passed. + Note: in future will add base model parsing in this step too and extend new model off base model. + """ + # base_model = self.parse_base_model() + + # Call appropriate data model parser and return parsed model. + if self.model_type == "CSV": + csv_parser = DataModelCSVParser() + model_dict = csv_parser.parse_csv_model(self.path_to_data_model) + elif self.model_type == "JSONLD": + jsonld_parser = DataModelJSONLDParser() + model_dict = jsonld_parser.parse_jsonld_model(self.path_to_data_model) + else: + raise ValueError( + f"Schematic only accepts models of type CSV or JSONLD, you provided a model type {self.model_type}, please resubmit in the proper format." + ) + return model_dict + + +class DataModelCSVParser: + def __init__(self): + # Instantiate DataModelRelationships + self.dmr = DataModelRelationships() + # Load relationships dictionary. + self.rel_dict = self.dmr.define_data_model_relationships() + # Get edge relationships + self.edge_relationships_dictionary = self.dmr.retreive_rel_headers_dict( + edge=True + ) + # Load required csv headers + self.required_headers = self.dmr.define_required_csv_headers() + # Get the type for each value that needs to be submitted. + # using csv_headers as keys to match required_headers/relationship_types + self.rel_val_types = { + v["csv_header"]: v["type"] + for k, v in self.rel_dict.items() + if "type" in v.keys() + } + + def check_schema_definition(self, model_df: pd.DataFrame) -> bool: + """Checks if a schema definition data frame contains the right required headers. + Args: + model_df: a pandas dataframe containing schema definition; see example here: https://docs.google.com/spreadsheets/d/1J2brhqO4kpeHIkNytzlqrdIiRanXDr6KD2hqjOTC9hs/edit#gid=0 + Raises: Exception if model_df does not have the required headers. 
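# A minimal usage sketch of the parser entry point defined above; "example.model.csv" is a
# hypothetical path, and the file extension alone decides which concrete parser is used.
parser = DataModelParser(path_to_data_model="example.model.csv")
assert parser.model_type == "CSV"
attr_rel_dict = parser.parse_model()
# -> {Attribute Display Name: {"Relationships": {CSV Header: Value}}}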
+ """ + if set(self.required_headers).issubset(set(list(model_df.columns))): + logger.debug("Schema definition csv ready for processing!") + return + elif "Requires" in list(model_df.columns) or "Requires Component" in list( + model_df.columns + ): + raise ValueError( + "The input CSV schema file contains the 'Requires' and/or the 'Requires " + "Component' column headers. These columns were renamed to 'DependsOn' and " + "'DependsOn Component', respectively. Switch to the new column names." + ) + elif not set(self.required_headers).issubset(set(list(model_df.columns))): + raise ValueError( + f"Schema extension headers: {set(list(model_df.columns))} " + f"do not match required schema headers: {self.required_headers}" + ) + return + + def parse_entry(self, attr: dict, relationship: str) -> Any: + """Parse attr entry baed on type + Args: + attr, dict: single row of a csv model in dict form, where only the required headers are keys. Values are the entries under each header. + relationship, str: one of the header relationships to parse the entry of. + Returns: + parsed_rel_entry, any: parsed entry for downstream processing based on the entry type. + """ + + rel_val_type = self.rel_val_types[relationship] + # Parse entry based on type: + # If the entry should be preserved as a bool dont convert to str. + if rel_val_type == bool and type(attr[relationship]) == bool: + parsed_rel_entry = attr[relationship] + # Move strings to list if they are comma separated. Schema order is preserved, remove any empty strings added by trailing commas + elif rel_val_type == list: + parsed_rel_entry = attr[relationship].strip().split(",") + parsed_rel_entry = [r.strip() for r in parsed_rel_entry if r] + # Convert value string if dictated by rel_val_type, strip whitespace. + elif rel_val_type == str: + parsed_rel_entry = str(attr[relationship]).strip() + else: + raise ValueError( + "The value type recorded for this relationship, is not currently supported for CSV parsing. Please check with your DCC." + ) + return parsed_rel_entry + + def gather_csv_attributes_relationships( + self, model_df: pd.DataFrame + ) -> Dict[str, dict[str, Any]]: + """Parse csv into a attributes:relationshps dictionary to be used in downstream efforts. + Args: + model_df: pd.DataFrame, data model that has been loaded into pandas DataFrame. + Returns: + attr_rel_dictionary: dict, + {Attribute Display Name: { + Relationships: { + CSV Header: Value}}} + """ + # Check csv schema follows expectations. + self.check_schema_definition(model_df) + + # get attributes from Attribute column + attributes = model_df[list(self.required_headers)].to_dict("records") + + # Build attribute/relationship dictionary + relationship_types = self.required_headers + attr_rel_dictionary = {} + + for attr in attributes: + attribute_name = attr["Attribute"] + # Add attribute to dictionary + attr_rel_dictionary.update(attr_dict_template(attribute_name)) + # Fill in relationship info for each attribute. 
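# Behaviour of parse_entry above, shown on hypothetical cell values:
#   "Female, Male, "  declared as list -> ["Female", "Male"]   (split on commas, empties dropped)
#   "  Patient  "     declared as str  -> "Patient"            (cast to str, whitespace stripped)
#   True              declared as bool -> True                 (preserved as-is)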
+            for relationship in relationship_types:
+                if not pd.isnull(attr[relationship]):
+                    parsed_rel_entry = self.parse_entry(
+                        attr=attr, relationship=relationship
+                    )
+                    attr_rel_dictionary[attribute_name]["Relationships"].update(
+                        {relationship: parsed_rel_entry}
+                    )
+        return attr_rel_dictionary
+
+    def parse_csv_model(
+        self,
+        path_to_data_model: str,
+    ):
+        """Load a csv data model and parse it into an attributes:relationships dictionary.
+        Args:
+            path_to_data_model, str: path to data model
+        Returns:
+            model_dict, dict: {Attribute Display Name: {
+                                   Relationships: {
+                                       CSV Header: Value}}}
+        """
+        # Load the csv data model to DF
+        model_df = load_df(path_to_data_model, data_model=True)
+
+        # Gather info from the model
+        model_dict = self.gather_csv_attributes_relationships(model_df)
+
+        return model_dict
+
+
+class DataModelJSONLDParser:
+    def __init__(
+        self,
+    ):
+        # Instantiate DataModelRelationships
+        self.dmr = DataModelRelationships()
+        # Load relationships dictionary.
+        self.rel_dict = self.dmr.define_data_model_relationships()
+
+    def parse_entry(
+        self,
+        rel_entry: Any,
+        id_jsonld_key: str,
+        dn_label_dict: dict[str, str],
+        model_jsonld: list[dict],
+    ) -> Any:
+        """Parse an input entry based on its type and the relationship it records.
+        Args:
+            rel_entry: the value recorded for a single entry and relationship in a JSONLD data model
+            id_jsonld_key, str: the jsonld key for id
+            dn_label_dict, dict: dictionary of model labels to display names
+            model_jsonld, list: list of dictionaries, each dictionary is an entry in the jsonld data model
+        Returns:
+            parsed_rel_entry: an entry that has been parsed based on its input type and characteristics.
+        """
+        # Retrieve ID from single value dictionary
+        if type(rel_entry) == dict and len(rel_entry.keys()) == 1:
+            parsed_rel_entry = rel_entry["@id"]
+        # Parse list of dictionaries to make a list of entries with context stripped (will update this section when contexts added.)
+        elif type(rel_entry) == list and type(rel_entry[0]) == dict:
+            parsed_rel_entry = self.convert_entry_to_dn_label(
+                [r[id_jsonld_key].split(":")[1] for r in rel_entry], model_jsonld
+            )
+        # Strip context from string and convert true/false to bool
+        elif type(rel_entry) == str:
+            # Remove contexts and treat strings as appropriate.
+            if ":" in rel_entry and "http:" not in rel_entry:
+                parsed_rel_entry = rel_entry.split(":")[1]
+                # Convert true/false strings to boolean
+                if parsed_rel_entry.lower() == "true":
+                    parsed_rel_entry = True
+                elif parsed_rel_entry.lower() == "false":
+                    parsed_rel_entry = False
+            else:
+                parsed_rel_entry = self.convert_entry_to_dn_label(rel_entry, model_jsonld)
+
+        # For anything else, try to convert the entry to its display name
+        else:
+            parsed_rel_entry = self.convert_entry_to_dn_label(rel_entry, model_jsonld)
+
+        return parsed_rel_entry
+
+    def label_to_dn_dict(self, model_jsonld: list[dict]):
+        """Generate a dictionary mapping labels to display names, so display names can easily be looked up by label.
+        Args:
+            model_jsonld: list of dictionaries, each dictionary is an entry in the jsonld data model
+        Returns:
+            dn_label_dict: dict of model labels to display names
+        """
+        jsonld_keys_to_extract = ["label", "displayName"]
+        label_jsonld_key, dn_jsonld_key = [
+            self.rel_dict[key]["jsonld_key"] for key in jsonld_keys_to_extract
+        ]
+        dn_label_dict = {}
+        for entry in model_jsonld:
+            dn_label_dict[entry[label_jsonld_key]] = entry[dn_jsonld_key]
+        return dn_label_dict
+
+    def convert_entry_to_dn_label(
+        self, parsed_rel_entry: Union[str, list], model_jsonld: list[dict]
+    ) -> Union[str, list]:
+        """Convert a parsed entry to its display name, taking into account the entry type.
+        Args:
+            parsed_rel_entry: an entry that has been parsed based on its input type and characteristics.
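# Sketch of the lookup built by label_to_dn_dict above. The @graph entries are toy values;
# the keys ("rdfs:label", "sms:displayName") are the jsonld_keys defined in
# data_model_relationships.py.
toy_graph = [
    {"rdfs:label": "FamilyHistory", "sms:displayName": "Family History"},
    {"rdfs:label": "Patient", "sms:displayName": "Patient"},
]
# label_to_dn_dict(toy_graph) -> {"FamilyHistory": "Family History", "Patient": "Patient"}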
+ model_jsonld: list of dictionaries, each dictionary is an entry in the jsonld data model + Returns: + parsed_rel_entry: an entry that has been parsed based on its input type and characteristics, and converted to display names. + """ + # Get a dictionary of display_names mapped to labels + dn_label_dict = self.label_to_dn_dict(model_jsonld=model_jsonld) + # Handle if using the display name as the label + if type(parsed_rel_entry) == list: + parsed_rel_entry = [dn_label_dict.get(entry) if dn_label_dict.get(entry) else entry for entry in parsed_rel_entry ] + elif type(parsed_rel_entry) == str: + converted_label = dn_label_dict.get(parsed_rel_entry) + if converted_label: + parsed_rel_entry = dn_label_dict.get(parsed_rel_entry) + return parsed_rel_entry + + def gather_jsonld_attributes_relationships(self, model_jsonld: List[dict]) -> Dict: + """ + Args: + model_jsonld: list of dictionaries, each dictionary is an entry in the jsonld data model + Returns: + attr_rel_dictionary: dict, + {Node Display Name: + {Relationships: { + CSV Header: Value}}} + Notes: + - Unlike a CSV the JSONLD might already have a base schema attached to it. + So the attributes:relationship dictionary for importing a CSV vs JSONLD may not match. + - It is also just about impossible to extract attributes explicitly. Using a dictionary should avoid duplications. + - This is a promiscuous capture and will create an attribute for each model entry. + - Currently only designed to capture the same information that would be encoded in CSV, + can be updated in the future. + TODO: + - Find a way to delete non-attribute keys, is there a way to reliable distinguish after the fact? + - Right now, here we are stripping contexts, will need to track them in the future. + """ + + # Retrieve relevant JSONLD keys. + jsonld_keys_to_extract = ["label", "subClassOf", "id", "displayName"] + label_jsonld_key, subclassof_jsonld_key, id_jsonld_key, dn_jsonld_key = [ + self.rel_dict[key]["jsonld_key"] for key in jsonld_keys_to_extract + ] + + # Get a dictionary of display names to labels to identify values explicitly recorded + dn_label_dict = self.label_to_dn_dict(model_jsonld=model_jsonld) + + # Build the attr_rel_dictionary + attr_rel_dictionary = {} + # Move through each entry in the jsonld model + for entry in model_jsonld: + # Get the attr key for the dictionary + if dn_jsonld_key in entry: + # The attr_key is the entry display name if one was recorded + attr_key = entry[dn_jsonld_key] + else: + # If not we wil use the get the label. + attr_key = entry[label_jsonld_key] + + # If the entry has not already been added to the dictionary, add it. + if attr_key not in attr_rel_dictionary.keys(): + attr_rel_dictionary.update(attr_dict_template(attr_key)) + + # Add relationships for each entry + # Go through each defined relationship type (rel_key) and its attributes (rel_vals) + for rel_key, rel_vals in self.rel_dict.items(): + # Determine if current entry in the for loop, can be described by the current relationship that is being cycled through. + # used to also check "csv_header" in rel_vals.keys() which allows all JSONLD values through even if it does not have a CSV counterpart, will allow other values thorough in the else statement now + if ( + rel_vals["jsonld_key"] in entry.keys() + and rel_vals["csv_header"] + ): + # Retrieve entry value associated with the given relationship + rel_entry = entry[rel_vals["jsonld_key"]] + # If there is an entry parse it by type and add to the attr:relationships dictionary. 
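# For a toy @graph entry such as (hypothetical values, keys taken from data_model_relationships.py):
#   {"@id": "bts:Patient", "rdfs:label": "Patient", "sms:displayName": "Patient",
#    "sms:requiresDependency": [{"@id": "bts:YearofBirth"}], "sms:required": "sms:false"},
# this loop records, among others, "DependsOn" and "Required" entries for the "Patient" attribute.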
+ if rel_entry: + parsed_rel_entry = self.parse_entry( + rel_entry=rel_entry, id_jsonld_key=id_jsonld_key, dn_label_dict=dn_label_dict, model_jsonld=model_jsonld, + ) + rel_csv_header = self.rel_dict[rel_key]["csv_header"] + if rel_key == 'domainIncludes': + # In the JSONLD the domain includes field contains the ids of attributes that the current attribute is the property/parent of. + # Because of this we need to handle these values differently. + # We will get the values in the field (parsed_val), then add the current attribute as to the property key in the attr_rel_dictionary[p_attr_key]. + for parsed_val in parsed_rel_entry: + attr_in_dict = False + #Get propert/parent key (displayName) + p_attr_key='' + # Check if the parsed value is already a part of the attr_rel_dictionary + for attr_dn, rels in attr_rel_dictionary.items(): + if parsed_val == attr_dn: + p_attr_key = attr_dn + attr_in_dict = True + # If it is part of the dictionary update add current attribute as a property of the parsed value + if attr_in_dict == True: + if not rel_csv_header in attr_rel_dictionary[p_attr_key]["Relationships"]: + attr_rel_dictionary[p_attr_key]["Relationships"].update({rel_csv_header:[entry[dn_jsonld_key]]}) + else: + attr_rel_dictionary[p_attr_key]["Relationships"].update({rel_csv_header:[entry[dn_jsonld_key]]}) + # If the parsed_val is not already recorded in the dictionary, add it + elif attr_in_dict == False: + # Get the display name for the parsed value + p_attr_key = self.convert_entry_to_dn_label(parsed_val, model_jsonld) + + attr_rel_dictionary.update(attr_dict_template(p_attr_key)) + attr_rel_dictionary[p_attr_key]["Relationships"].update({rel_csv_header:[entry[label_jsonld_key]]}) + + else: + attr_rel_dictionary[attr_key]["Relationships"].update( + {rel_csv_header: parsed_rel_entry} + ) + + elif ( + rel_vals["jsonld_key"] in entry.keys() + and not rel_vals["csv_header"] + ): + # Retrieve entry value associated with the given relationship + rel_entry = entry[rel_vals["jsonld_key"]] + # If there is an entry parset it by type and add to the attr:relationships dictionary. + if rel_entry: + parsed_rel_entry = self.parse_entry( + rel_entry=rel_entry, id_jsonld_key=id_jsonld_key, dn_label_dict=dn_label_dict, model_jsonld=model_jsonld, + ) + # Add relationships for each attribute and relationship to the dictionary + attr_rel_dictionary[attr_key]["Relationships"].update( + {rel_key: parsed_rel_entry} + ) + return attr_rel_dictionary + + def parse_jsonld_model( + self, + path_to_data_model: str, + ): + """Convert raw JSONLD data model to attributes relationship dictionary. + Args: + path_to_data_model: str, path to JSONLD data model + Returns: + model_dict: dict, + {Node Display Name: + {Relationships: { + CSV Header: Value}}} + """ + # Log warning that JSONLD parsing is in beta mode. + logger.warning( + "JSONLD parsing is in Beta Mode. Please inspect outputs carefully and report any errors." + ) + # Load the json_ld model to df + json_load = load_json(path_to_data_model) + # Convert dataframe to attributes relationship dictionary. 
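# Worked example of the "domainIncludes" branch above (hypothetical names): a property entry
# declaring schema:domainIncludes [{"@id": "bts:Patient"}] is not given a "Properties" value
# itself; instead the referenced class "Patient" gains this property under its own
# "Properties" key, matching how the CSV lists properties on the class row.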
+ model_dict = self.gather_jsonld_attributes_relationships(json_load["@graph"]) + return model_dict diff --git a/schematic/schemas/data_model_relationships.py b/schematic/schemas/data_model_relationships.py new file mode 100644 index 000000000..9be7952dc --- /dev/null +++ b/schematic/schemas/data_model_relationships.py @@ -0,0 +1,223 @@ +from typing import Dict +from schematic.utils.schema_utils import ( + get_label_from_display_name, + get_attribute_display_name_from_label, + convert_bool_to_str, + parse_validation_rules, +) +from schematic.schemas.curie import uri2curie, curie2uri + + +class DataModelRelationships: + def __init__(self) -> None: + self.relationships_dictionary = self.define_data_model_relationships() + + def define_data_model_relationships(self) -> Dict: + """Define the relationships and their attributes so they can be accessed through other classes. + The key is how it the relationship will be referenced througout Schematic. + Note: Though we could use other keys to determine which keys define nodes and edges, + edge_rel is used as an explicit definition, for easier code readablity. + key: + jsonld_key: Name for relationship in the JSONLD. + Include in all sub-dictionaries. + csv_header: Str, name for this relationshp in the CSV data model. + Enter None if not part of the CSV data model. + node_label: Name for relationship in the graph representation of the data model. + Do not include this key for edge relationships. + type: type, type of expected to be read into graph creation. + edge_rel: True, if this relationship defines an edge + False, if is a value relationship + Include in all sub-dictionaries. + required_header: True, if relationship header is required for the csv + jsonld_default: Defines default values to fill for JSONLD generation. + Used during func DataModelJsonLD.clean_template(), to fill value with a default, if not supplied in the data model. + node_attr_dict: This is used to add information to nodes in the model. Only include for nodes not edges. + set default values for this relationship + key is the node relationship name, value is the default value. + If want to set default as a function create a nested dictionary. + {'default': default_function, + 'standard': alternative function to call if relationship is present for a node} + } + If adding new functions to node_dict will + need to modify data_model_nodes.generate_node_dict in + edge_dir: str, 'in'/'out' is the edge an in or out edge. Define for edge relationships + jsonld_dir: str, 'in'/out is the direction in or out in the JSONLD. 
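# Reading a single definition from the dictionary returned below, for example:
#   rel_dict = DataModelRelationships().relationships_dictionary
#   rel_dict["rangeIncludes"]["csv_header"]  -> "Valid Values"
#   rel_dict["rangeIncludes"]["edge_rel"]    -> True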
+ + TODO: + - Use class inheritance to set up + """ + map_data_model_relationships = { + "displayName": { + "jsonld_key": "sms:displayName", + "csv_header": "Attribute", + "node_label": "displayName", + "type": str, + "edge_rel": False, + "required_header": True, + "node_attr_dict": { + "default": get_attribute_display_name_from_label, + "standard": get_attribute_display_name_from_label, + }, + }, + "label": { + "jsonld_key": "rdfs:label", + "csv_header": None, + "node_label": "label", + "type": str, + "edge_rel": False, + "required_header": False, + "node_attr_dict": { + "default": get_label_from_display_name, + "standard": get_label_from_display_name, + }, + }, + "comment": { + "jsonld_key": "rdfs:comment", + "csv_header": "Description", + "node_label": "comment", + "type": str, + "edge_rel": False, + "required_header": True, + "node_attr_dict": {"default": "TBD"}, + }, + "rangeIncludes": { + "jsonld_key": "schema:rangeIncludes", + "csv_header": "Valid Values", + "edge_key": "rangeValue", + "jsonld_direction": "out", + "edge_dir": "out", + "type": list, + "edge_rel": True, + "required_header": True, + }, + "requiresDependency": { + "jsonld_key": "sms:requiresDependency", + "csv_header": "DependsOn", + "edge_key": "requiresDependency", + "jsonld_direction": "out", + "edge_dir": "out", + "type": list, + "edge_rel": True, + "required_header": True, + }, + "requiresComponent": { + "jsonld_key": "sms:requiresComponent", + "csv_header": "DependsOn Component", + "edge_key": "requiresComponent", + "jsonld_direction": "out", + "edge_dir": "out", + "type": list, + "edge_rel": True, + "required_header": True, + }, + "required": { + "jsonld_key": "sms:required", + "csv_header": "Required", + "node_label": "required", + "type": bool, + "jsonld_default": "sms:false", + "edge_rel": False, + "required_header": True, + "node_attr_dict": { + "default": False, + "standard": convert_bool_to_str, + }, + }, + "subClassOf": { + "jsonld_key": "rdfs:subClassOf", + "csv_header": "Parent", + "edge_key": "parentOf", + "jsonld_direction": "in", + "edge_dir": "out", + "jsonld_default": [{"@id": "bts:Thing"}], + "type": list, + "edge_rel": True, + "required_header": True, + }, + "validationRules": { + "jsonld_key": "sms:validationRules", + "csv_header": "Validation Rules", + "node_label": "validationRules", + "jsonld_direction": "out", + "edge_dir": "out", + "jsonld_default": [], + "type": list, + "edge_rel": False, + "required_header": True, + "node_attr_dict": { + "default": [], + "standard": parse_validation_rules, + }, + }, + "domainIncludes": { + "jsonld_key": "schema:domainIncludes", + "csv_header": "Properties", + "edge_key": "domainValue", + "jsonld_direction": "out", + "edge_dir": "in", + "type": list, + "edge_rel": True, + "required_header": True, + }, + "isPartOf": { + "jsonld_key": "schema:isPartOf", + "csv_header": None, + "node_label": "isPartOf", + "type": dict, + "edge_rel": False, + "required_header": False, + "node_attr_dict": { + "default": {"@id": "http://schema.biothings.io"}, + }, + }, + "id": { + "jsonld_key": "@id", + "csv_header": "Source", + "node_label": "uri", + "type": str, + "edge_rel": False, + "required_header": True, + "node_attr_dict": { + "default": get_label_from_display_name, + "standard": get_label_from_display_name, + }, + }, + } + + return map_data_model_relationships + + def define_required_csv_headers(self): + """Helper function to retrieve required CSV headers, alert if required header was not provided. + Returns: + required_headers: lst, Required CSV headers. 
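# With the dictionary defined above, the headers collected below are, in declaration order:
#   ["Attribute", "Description", "Valid Values", "DependsOn", "DependsOn Component",
#    "Required", "Parent", "Validation Rules", "Properties", "Source"]
# ("label" and "isPartOf" set required_header to False, so they are skipped.)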
+        """
+        required_headers = []
+        for k, v in self.relationships_dictionary.items():
+            try:
+                if v["required_header"]:
+                    required_headers.append(v["csv_header"])
+            except KeyError:
+                print(
+                    f"Did not provide a 'required_header' key, value pair for the nested dictionary {k}"
+                )
+
+        return required_headers
+
+    def retreive_rel_headers_dict(self, edge: bool) -> Dict[str, str]:
+        """Helper function to retrieve the CSV headers for edge or non-edge relationships, selected by the `edge` flag.
+        Args:
+            edge, bool: True to return edge relationships, False to return value (non-edge) relationships.
+        Returns:
+            rel_headers_dict: dict mapping each matching relationship key to its csv_header.
+        """
+        rel_headers_dict = {}
+        for rel, rel_dict in self.relationships_dictionary.items():
+            if "edge_rel" in rel_dict:
+                if rel_dict["edge_rel"] and edge:
+                    rel_headers_dict.update({rel: rel_dict["csv_header"]})
+                elif not rel_dict["edge_rel"] and not edge:
+                    rel_headers_dict.update({rel: rel_dict["csv_header"]})
+            else:
+                raise ValueError(
+                    f"Did not provide an 'edge_rel' key for relationship {rel}"
+                )
+
+        return rel_headers_dict
diff --git a/schematic/schemas/data_model_validator.py b/schematic/schemas/data_model_validator.py
new file mode 100644
index 000000000..e3d626882
--- /dev/null
+++ b/schematic/schemas/data_model_validator.py
@@ -0,0 +1,173 @@
+import logging
+import multiprocessing
+import networkx as nx
+import time
+from typing import Any, Dict, Optional, Text, List, Tuple
+
+from schematic.schemas.data_model_relationships import DataModelRelationships
+
+logger = logging.getLogger(__name__)
+
+class DataModelValidator:
+    """
+    Check for consistency within the data model.
+    """
+
+    def __init__(
+        self,
+        graph: nx.MultiDiGraph,
+    ):
+        """
+        Args:
+            graph, nx.MultiDiGraph: Graph representation of the data model.
+        TODO: put blacklisted chars and reserved_names in some global space where they can be accessed centrally
+        """
+        self.graph = graph
+        self.DMR = DataModelRelationships()
+        # Define blacklisted characters, taken from store.synapse
+        self.blacklisted_chars = ["(", ")", ".", "-"]
+        # Define reserved_names, taken from Documentation
+        self.reserved_names = {"entityId"}
+
+    def run_checks(self) -> Tuple[list, list]:
+        """Run all validation checks on the data model graph.
+        Returns:
+            tuple(list, list): a tuple of the errors and warnings generated.
+        TODO: In the future, groups could customize which tests run for them, run additional tests, or demote some errors to warnings (and vice versa).
+        """
+        error_checks = [
+            self.check_graph_has_required_node_fields(),
+            self.check_is_dag(),
+            self.check_reserved_names(),
+        ]
+        warning_checks = [
+            self.check_blacklisted_characters(),
+        ]
+        errors = [error for error in error_checks if error]
+        warnings = [warning for warning in warning_checks if warning]
+        return errors, warnings
+
+    def check_graph_has_required_node_fields(self) -> List[str]:
+        """Check that every node in the graph carries the required node fields.
+        Returns:
+            error, list: list of error messages, one for each missing field.
+        """
+        # Get all the fields that should be recorded per node
+        rel_dict = self.DMR.relationships_dictionary
+        node_fields = []
+        for k, v in rel_dict.items():
+            if "node_label" in v.keys():
+                node_fields.append(v["node_label"])
+
+        error = []
+        missing_fields = []
+        # Check that required fields are present for each node.
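# A usage sketch of the validator defined above, assuming `graph` is the nx.MultiDiGraph
# produced by the graph-building step:
#   validator = DataModelValidator(graph=graph)
#   errors, warnings = validator.run_checks()
#   if errors:
#       raise ValueError(errors)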
+ for node, node_dict in self.graph.nodes(data=True): + missing_fields.extend( + [(node, f) for f in node_fields if f not in node_dict.keys()] + ) + + if missing_fields: + for mf in missing_fields: + error.append( + f"For entry: {mf[0]}, the required field {mf[1]} is missing in the data model graph, please double check your model and generate the graph again." + ) + return error + + def run_cycles(self, graph): + cycles = nx.simple_cycles(self.graph) + if cycles: + for cycle in cycles: + logger.warning( + f"Schematic requires models be a directed acyclic graph (DAG). Your graph is not a DAG, we found a loop between: {cycle[0]} and {cycle[1]}, please remove this loop from your model and submit again." + ) + + def check_is_dag(self) -> List[str]: + """Check that generated graph is a directed acyclic graph + Returns: + error, list: List of error messages if graph is not a DAG. List will include a message for each cycle found, if not there is a more generic message for the graph as a whole. + """ + error = [] + if not nx.is_directed_acyclic_graph(self.graph): + cycles = multiprocessing.Process(target=self.run_cycles, name="Get Cycles", args=(self.graph,)) + cycles.start() + + # Give up to 5 seconds to find cycles, if not exit and issue standard error + time.sleep(5) + + # If thread is active + if cycles.is_alive(): + # Terminate foo + cycles.terminate() + # Cleanup + cycles.join() + + error.append( + f"Schematic requires models be a directed acyclic graph (DAG). Please inspect your model." + ) + + return error + + def check_blacklisted_characters(self) -> List[str]: + """We strip these characters in store, so not sure if it matter if we have them now, maybe add warning + Returns: + warning, list: list of warnings for each node in the graph, that has a Display name that contains blacklisted characters. + """ + warning = [] + for node, node_dict in self.graph.nodes(data=True): + if any( + bl_char in node_dict["displayName"] + for bl_char in self.blacklisted_chars + ): + node_display_name = node_dict["displayName"] + blacklisted_characters_found = [ + bl_char + for bl_char in self.blacklisted_chars + if bl_char in node_dict["displayName"] + ] + blacklisted_characters_str = ",".join(blacklisted_characters_found) + warning.append( + f"Node: {node_display_name} contains a blacklisted character(s): {blacklisted_characters_str}, they will be striped if used in Synapse annotations." + ) + return warning + + def check_reserved_names(self) -> List[str]: + """Identify if any names nodes in the data model graph are the same as reserved name. + Returns: + error, list: List of erros for every node in the graph whose name overlaps with the reserved names. + """ + error = [] + reserved_names_found = [ + (name, node) + for node in self.graph.nodes + for name in self.reserved_names + if name.lower() == node.lower() + ] + if reserved_names_found: + for reserved_name, node_name in reserved_names_found: + error.append( + f"Your data model entry name: {node_name} overlaps with the reserved name: {reserved_name}. Please change this name in your data model." + ) + return error + + def check_namespace_overlap(self): + """ + Check if name is repeated. + Implement in the future + """ + warning = [] + return warning + + def check_for_orphan_attributes(self): + """ + Check if attribute is specified but not connected to another attribute or component. 
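# For reference, check_blacklisted_characters above would warn on a display name such as
# "Family History (Cancer)" (hypothetical), because "(" and ")" appear in
# self.blacklisted_chars and are stripped when used as Synapse annotations.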
+ Implement in future + """ + warning = [] + return warning + + def check_namespace_similarity(self): + """ + Using AI, check if submitted attributes or valid values are similar to other ones, warn users. + Implement in future + """ + warning = [] + return warning diff --git a/schematic/schemas/df_parser.py b/schematic/schemas/df_parser.py deleted file mode 100644 index a2eaceb36..000000000 --- a/schematic/schemas/df_parser.py +++ /dev/null @@ -1,787 +0,0 @@ -import os -import string -import re -import io -import requests -import logging - -from typing import ( - Any, - Dict, - Optional, - Text, -) # allows specifying explicit variable types - -import pandas as pd -import numpy as np - -from schematic.schemas.explorer import SchemaExplorer -from schematic import LOADER - -from schematic.utils.validate_rules_utils import validate_schema_rules -from schematic.utils.df_utils import load_df - -logger = logging.getLogger(__name__) - - -""" -Utility for converting csv file containing a data model definition schema (see scRNA-seq.csv for an example) into schema.org schema. -""" - -# required headers for schema; may or may not abstract further; for now hardcode -required_headers = set( - [ - "Attribute", - "Description", - "Valid Values", - "DependsOn", - "Required", - "Parent", - "Properties", - "DependsOn Component", - "Source", - "Validation Rules", - ] -) - - -def get_class( - se: SchemaExplorer, - class_display_name: str, - description: str = None, - subclass_of: list = [], - requires_dependencies: list = None, - requires_range: list = None, - requires_components: list = None, - required: bool = None, - validation_rules: list = None, -) -> dict: - - """Constructs a new schema.org compliant class given a set of schema object attributes - - Args: - se: a schema explorer object allowing the traversal and modification of a schema graph - display_class_name: human readable label for the schema object/attribute: key characteristic X of the assay, related protocol, or downstream data that we want to record as metadata feature - description: definition or a reference containing the definition of attribute X. Preferably provide a source ontology link or code in addition to the definition. - subclass_of: *schema* label of this attribute/object's parent node in the schema - requires_dependencies: important characteristics, if any, of attribute X that need to be recorded as metadata features given attribute X is specified. These characteristics are attributes themselves and need to pre-exist in the schema as such - requires_range: a set/range of values that this attribute can be assigned to. this domain is stored in the rangeIncludes property of this object. - requires_components: a set of associated components/categories that this object/entity requires for its full specification; each component is a high level ontology class in which entities/objects are categorized/componentized and it is an entity on its own that needs to exist in the schema. - required: indicates if this attribute is required or optional in a schema - validation_rules: a list of validation rules defined for this class (e.g. 
defining what is a valid object of this class) - - Returns: a json schema.org object - """ - - class_name = se.get_class_label_from_display_name(class_display_name) - - # setup biothings object template with mandatory elements - class_attributes = { - "@id": "bts:" + class_name, - "@type": "rdfs:Class", - "rdfs:comment": description - if description and not pd.isnull(description) - else "TBD", - "rdfs:label": class_name, - "schema:isPartOf": {"@id": "http://schema.biothings.io"}, - } - - # determine parent class of element and add subclass relationship to schema - required by biothings - # if no subclass is provided, set a default to schema.org Thing - if subclass_of: - if len(subclass_of) == 1 and pd.isnull(subclass_of[0]): - parent = {"rdfs:subClassOf": [{"@id": "schema:Thing"}]} - else: - parent = { - "rdfs:subClassOf": [ - {"@id": "bts:" + se.get_class_label_from_display_name(sub)} - for sub in subclass_of - ] - } - else: - parent = {"rdfs:subClassOf": [{"@id": "schema:Thing"}]} - - class_attributes.update(parent) - - # add optional attribute specifying attributes/objects that are required for the specification of this object - # useful for specifying annotation requirements, for example - if requires_dependencies: - requirement = { - "sms:requiresDependency": [ - {"@id": "bts:" + dep} for dep in requires_dependencies - ] - } - class_attributes.update(requirement) - - # add optional attribute specifying the possible values this object can be set to; can be other objects, including primitives - if requires_range: - value_constraint = { - "schema:rangeIncludes": [ - {"@id": "bts:" + se.get_class_label_from_display_name(val)} - for val in requires_range - ] - } - class_attributes.update(value_constraint) - - # add optional attribute specifying validation patterns associated with this object (e.g. precise definition of the object range) - if validation_rules: - class_attributes.update({"sms:validationRules": validation_rules}) - else: - class_attributes.update({"sms:validationRules": []}) - - # add optional attribute specifying the required components (i.e. high level ontology class in which entities/objects are categorized/componentized) - # that are required for the specification of this object - if requires_components: - requirement = { - "sms:requiresComponent": [{"@id": "bts:" + c} for c in requires_components] - } - class_attributes.update(requirement) - - if required: - class_attributes.update({"sms:required": "sms:true"}) - else: - class_attributes.update({"sms:required": "sms:false"}) - - # ensure display name does not contain leading/trailing white spaces - class_attributes.update({"sms:displayName": class_display_name.strip()}) - - return class_attributes - - -def get_property( - se: SchemaExplorer, - property_display_name: str, - property_class_names: list, - description: str = None, - requires_range: list = None, - requires_dependencies: list = None, - required: bool = None, - validation_rules: str = None, -) -> dict: - - """Constructs a new schema.org compliant property of an existing schema.org object/class; note that the property itself is a schema.org object class. 
- - Args: - se: a schema explorer object allowing the traversal and modification of a schema graph - property_display_name: human readable label for the schema object/attribute: key characteristic X of the assay, related protocol, or downstream data that we want to record as metadata feature - property_class_name: *schema* label of the classes/objects that this is a property of - description: definition or a reference containing the definition of attribute X. Preferably provide a source ontology link or code in addition to the definition. - requires_range: what is the set/domain of values that this attribute can be assigned to; currently only used to specify primitive types. TODO: extend to reg exp patterns - requires_dependencies: important characteristics, if any, of property X that need to be recorded as metadata features given property X is specified. These characteristics are attributes themselves and need to pre-exist in the schema as such - validation_rules: a list of validation rules defined for this class (e.g. defining what is a valid object of this property) - - - Returns: a json schema.org property object - """ - property_name = se.get_property_label_from_display_name(property_display_name) - - property_attributes = { - "@id": "bts:" + property_name, - "@type": "rdf:Property", - "rdfs:comment": description - if description and not pd.isnull(description) - else "TBD", - "rdfs:label": property_name, - "sms:displayName": property_display_name, - "schema:isPartOf": {"@id": "http://schema.biothings.io"}, - } - - domain_includes = { - "schema:domainIncludes": [ - {"@id": "bts:" + se.get_class_label_from_display_name(val)} - for val in property_class_names - ] - } - property_attributes.update(domain_includes) - - if requires_range: - value_constraint = { - "schema:rangeIncludes": [ - {"@id": "bts:" + se.get_class_label_from_display_name(val)} - for val in requires_range - ] - } - property_attributes.update(value_constraint) - - if requires_dependencies: - requirement = { - "sms:requiresDependency": [ - {"@id": "bts:" + dep} for dep in requires_dependencies - ] - } - property_attributes.update(requirement) - - # add optional attribute specifying validation patterns associated with this object (e.g. 
precise definition of the object range) - if validation_rules: - property_attributes.update({"sms:validationRules": validation_rules}) - else: - property_attributes.update({"sms:validationRules": []}) - - if required: - property_attributes.update({"sms:required": "sms:true"}) - else: - property_attributes.update({"sms:required": "sms:false"}) - - #'http://schema.org/domainIncludes':{'@id': 'bts:' + property_class_name}, - #'http://schema.org/rangeIncludes':{'@id': 'schema:' + allowed_values}, - - # ensure display name does not contain leading/trailing white spaces - property_attributes.update({"sms:displayName": property_display_name.strip()}) - - return property_attributes - - -def attribute_exists(se: SchemaExplorer, attribute_label: str) -> bool: - - """Check if a given attribute exists already in schema - - Args: - se: a schema explorer object allowing the traversal and modification of a schema graph - attribute_label: a schema label for the attribute to check - - Returns: - True/False indicating if attribute exists or not - """ - schema_graph = se.get_nx_schema() - - if attribute_label in schema_graph.nodes: - return True - return False - - -def check_schema_definition(schema_definition: pd.DataFrame) -> bool: - - """Checks if a schema definition data frame contains the right required headers. - - See schema definition guide for more details - TODO: post and link schema definition guide - - Args: - schema_definition: a pandas dataframe containing schema definition; see example here: https://docs.google.com/spreadsheets/d/1J2brhqO4kpeHIkNytzlqrdIiRanXDr6KD2hqjOTC9hs/edit#gid=0 - Raises: Exception - """ - - if required_headers.issubset(set(list(schema_definition.columns))): - return - elif "Requires" in list(schema_definition.columns) or "Requires Component" in list( - schema_definition.columns - ): - raise ValueError( - "The input CSV schema file contains the 'Requires' and/or the 'Requires " - "Component' column headers. These columns were renamed to 'DependsOn' and " - "'DependsOn Component', respectively. Switch to the new column names." - ) - -def _prop_2_classes(properties: dict) -> dict: - - """Create a dictionary linking all properties to their classes. - Args: - properties (dict): attributes and their properties (if applicable) - Returns: - Dictionary linking properties to all the classes in their domain. - """ - prop_2_classes = {} - for record in properties: - if not pd.isnull(record["Properties"]): - props = record["Properties"].strip().split(",") - for pr in props: - prop_2_classes.setdefault(pr.strip(),[]).append(record["Attribute"]) - - return prop_2_classes - -def create_nx_schema_objects( - schema_extension: pd.DataFrame, se: SchemaExplorer -) -> SchemaExplorer: - """Creates classes for all attributes and adds them to the schema. 
- Args: - schema_extension: a pandas dataframe containing schema definition; see example here: https://docs.google.com/spreadsheets/d/1J2brhqO4kpeHIkNytzlqrdIiRanXDr6KD2hqjOTC9hs/edit#gid=0 - se: a schema explorer object allowing the traversal and modification of a schema graph - base_schema_path: a path to a json-ld file containing an existing schema - Returns: - An updated schema explorer object - """ - - try: - check_schema_definition(schema_extension) - logger.debug("Schema definition csv ready for processing!") - except: - raise ValueError( - f"Schema extension headers: {set(list(schema_extension.columns))} " - f"do not match required schema headers: {required_headers}" - ) - - rel_dict = { - "rdfs:subClassOf": {"parentOf": "in"}, - "schema:domainIncludes": {"domainValue": "in"}, - "sms:requiresDependency": {"requiresDependency": "out"}, - "sms:requiresComponent": {"requiresComponent": "out"}, - "schema:rangeIncludes": {"rangeValue": "out"}, - } - - # get attributes from Attribute column - attributes = schema_extension[list(required_headers)].to_dict("records") - - # get all properties across all attributes from Properties column - props = set(schema_extension[["Properties"]].dropna().values.flatten()) - - # clean properties strings - all_properties = [] - for prop in props: - all_properties += [p.strip() for p in prop.split(",")] - - # get both attributes and their properties (if any) - properties = schema_extension[["Attribute", "Properties"]].to_dict("records") - - prop_2_classes = _prop_2_classes(properties) - - logger.debug("Adding attributes") - for attribute in attributes: - - required = None - if not pd.isnull(attribute["Required"]): - required = attribute["Required"] - - if not attribute["Attribute"] in all_properties: - # Attribute is not a property - display_name = attribute["Attribute"] - - subclass_of = None - if not pd.isnull(attribute["Parent"]): - subclass_of = [ - parent for parent in attribute["Parent"].strip().split(",") - ] - - new_class = get_class( - se, - display_name, - description=attribute["Description"], - subclass_of=subclass_of, - required=required, - ) - - se.add_schema_object_nx(new_class, **rel_dict) - - """ - print(se.get_nx_schema().nodes[new_class["rdfs:label"]]) - # check if attribute doesn't already exist and add it - if not attribute_exists(se, new_class["rdfs:label"]): - se.add_schema_object_nx(new_class, **rel_dict) - else: - print("ATTRIBUTE EXISTS") - print(new_class) - """ - - else: - # Attribute is a property - display_name = attribute["Attribute"] - - new_property = get_property( - se, - display_name, - prop_2_classes[display_name], - description=attribute["Description"], - required=required, - ) - - # check if attribute doesn't already exist and add it - if not attribute_exists(se, new_property["rdfs:label"]): - se.add_schema_object_nx(new_property, **rel_dict) - - logger.debug("Done adding attributes") - - # TODO check if schema already contains property - may require property context in csv schema definition - - logger.debug("Adding and editing properties") - - for prop in properties: - if not pd.isnull(prop["Properties"]): # a class may have or not have properties - for p in ( - prop["Properties"].strip().split(",") - ): # a class may have multiple properties - attribute = prop["Attribute"] - - # check if property is already present as attribute under attributes column - # TODO: adjust logic below to compactify code - p = p.strip() - if p in list(schema_extension["Attribute"]): - description = schema_extension.loc[ - 
schema_extension["Attribute"] == p - ]["Description"].values[0] - property_info = se.explore_property( - se.get_property_label_from_display_name(p) - ) - range_values = ( - property_info["range"] if "range" in property_info else None - ) - requires_dependencies = ( - property_info["dependencies"] - if "dependencies" in property_info - else None - ) - required = ( - property_info["required"] - if "required" in property_info - else None - ) - - new_property = get_property( - se, - p, - property_info["domain"], - description=description, - requires_range=range_values, - requires_dependencies=requires_dependencies, - required=required, - ) - se.edit_schema_object_nx(new_property) - else: - description = None - new_property = get_property( - se, p, attribute, description=description - ) - se.add_schema_object_nx(new_property, **rel_dict) - - logger.debug("Done adding properties") - - # # set range values and dependency requirements for each attribute - # # if not already added, add each attribute in required values and dependencies to the schema extension - # print("Editing attributes and properties to add requirements and value ranges") - # print("====================================================================================") - - for attribute in attributes: - - # TODO: refactor processing of multi-valued cells in columns and corresponding schema updates; it would compactify code below if class and property are encapsulated as objects inheriting from a common attribute parent object - - # get values in range for this attribute, if any are specified - range_values = attribute["Valid Values"] - if not pd.isnull(range_values): - # prepare the range values list and split based on appropriate delimiter - # if the string "range_values" starts with double quotes, then extract all "valid values" within double quotes - range_values_list = [] - if range_values[0] == '"': - range_values_list = re.findall(r'"([^"]*)"', range_values) - else: - range_values_list = range_values.strip().split(",") - - for val in range_values_list: - # check if value is in attributes column; add it as a class if not - if not val.strip() in list(schema_extension["Attribute"]): - - # determine parent class of the new value class - # if this attribute is not a property, set it as a parent class - if not attribute["Attribute"] in all_properties: - parent = [attribute["Attribute"]] - else: - # this attribute is a property, set the parent to the domain class of this attribute - - parent = se.get_class_by_property(attribute["Attribute"]) - - if not parent: - raise ValueError( - f"Listed valid value: {val}, for attribute: {attribute['Attribute']} " - "must have a class parent. The extension could not be added to the schema." 
- ) - new_class = get_class( - se, val, description=None, subclass_of=parent - ) - - # check if attribute doesn't already exist and add it - if not attribute_exists(se, new_class["rdfs:label"]): - se.add_schema_object_nx(new_class, **rel_dict) - - # update rangeIncludes of attribute - # if attribute is not a property, then assume it is a class - if not attribute["Attribute"] in all_properties: - class_info = se.explore_class( - se.get_class_label_from_display_name(attribute["Attribute"]) - ) - class_info["range"].append( - se.get_class_label_from_display_name(val) - ) - - class_range_edit = get_class( - se, - attribute["Attribute"], - description=attribute["Description"], - subclass_of=[attribute["Parent"]], - requires_dependencies=class_info["dependencies"], - requires_range=class_info["range"], - required=class_info["required"], - validation_rules=class_info["validation_rules"], - ) - se.edit_schema_object_nx(class_range_edit) - - else: - # the attribute is a property - property_info = se.explore_property( - se.get_property_label_from_display_name(attribute["Attribute"]) - ) - property_info["range"].append( - se.get_class_label_from_display_name(val) - ) - - property_range_edit = get_property( - se, - attribute["Attribute"], - property_info["domain"], - description=property_info["description"], - requires_dependencies=property_info["dependencies"], - requires_range=property_info["range"], - required=property_info["required"], - validation_rules=property_info["validation_rules"], - ) - se.edit_schema_object_nx(property_range_edit) - - logger.debug(val + " added to value range") - - # get validation rules for this attribute, if any are specified - validation_rules = attribute["Validation Rules"] - - if not pd.isnull(validation_rules): - - # TODO: make validation rules delimiter configurable parameter - - validation_rules = [ - val_rule.strip() for val_rule in validation_rules.strip().split("::") - ] - - validate_vr = validate_schema_rules( - validation_rules, - attribute["Attribute"], - input_filetype = 'csv_schema') - - - # update validation rules of attribute - # if attribute is not a property, then assume it is a class - if not attribute["Attribute"] in all_properties: - class_info = se.explore_class( - se.get_class_label_from_display_name(attribute["Attribute"]) - ) - class_info["validation_rules"] = validation_rules - class_val_rule_edit = get_class( - se, - attribute["Attribute"], - description=attribute["Description"], - subclass_of=[attribute["Parent"]], - requires_dependencies=class_info["dependencies"], - requires_range=class_info["range"], - required=class_info["required"], - validation_rules=class_info["validation_rules"], - ) - se.edit_schema_object_nx(class_val_rule_edit) - else: - # the attribute is a property - property_info = se.explore_property( - se.get_property_label_from_display_name(attribute["Attribute"]) - ) - property_info["validation_rules"] = validation_rules - property_val_rule_edit = get_property( - se, - attribute["Attribute"], - property_info["domain"], - description=property_info["description"], - requires_dependencies=property_info["dependencies"], - requires_range=property_info["range"], - required=property_info["required"], - validation_rules=property_info["validation_rules"], - ) - se.edit_schema_object_nx(property_val_rule_edit) - try: - logger.debug(val + "validation rules added") - except: - logger.debug("Validation rules added") - - # get dependencies for this attribute, if any are specified - requires_dependencies = attribute["DependsOn"] - - if not 
pd.isnull(requires_dependencies): - for dep in requires_dependencies.strip().split(","): - # check if dependency is a property or not - dep = dep.strip() - dep_is_property = dep in all_properties - dep_label = "" - # set dependency label based on kind of dependency: class or property - if dep_is_property: - dep_label = se.get_property_label_from_display_name(dep) - else: - dep_label = se.get_class_label_from_display_name(dep) - - # check if dependency is in attributes column; add it to the list if not - if not dep.strip() in list(schema_extension["Attribute"]): - # if dependency is a property create a new property; else create a new class - if not dep_is_property: - # if this attribute is not a property, set it as a parent class - if not attribute["Attribute"] in all_properties: - parent = attribute["Attribute"] - else: - # this attribute is a property, set the parent to the domain class of this attribute - parent = se.get_class_by_property(attribute["Attribute"]) - if not parent: - raise ValueError( - f"Listed required dependency: {dep}, for attribute: {attribute['Attribute']} " - "must have a class parent. The extension could not be added to the schema." - ) - - new_class = get_class( - se, dep, description=None, subclass_of=[parent] - ) - # se.add_schema_object_nx(new_class, **rel_dict) - # check if attribute doesn't already exist and add it - if not attribute_exists(se, new_class["rdfs:label"]): - se.add_schema_object_nx(new_class, **rel_dict) - - else: - if not attribute["Attribute"] in all_properties: - domain_attribute = attribute["Attribute"] - else: - # this attribute is a property, set the domain of this property to the domain class of the attribute - domain_attribute = se.get_class_by_property( - attribute["Attribute"] - ) - if not domain_attribute: - raise ValueError( - f"Listed required dependency: {dep}, must have a class parent. " - "The extension could not be added to the schema." 
- ) - - description = None - new_property = get_property( - se, dep, domain_attribute, description=description - ) - # check if attribute doesn't already exist and add it - if not attribute_exists(se, new_property["rdfs:label"]): - se.add_schema_object_nx(new_property, **rel_dict) - - # update required dependencies of attribute - # if attribute is not a property then assume it is a class - if not attribute["Attribute"] in all_properties: - class_info = se.explore_class( - se.get_class_label_from_display_name(attribute["Attribute"]) - ) - class_info["dependencies"].append(dep_label) - class_dependencies_edit = get_class( - se, - attribute["Attribute"], - description=attribute["Description"], - subclass_of=[attribute["Parent"]], - requires_dependencies=class_info["dependencies"], - requires_range=class_info["range"], - required=class_info["required"], - validation_rules=class_info["validation_rules"], - ) - se.edit_schema_object_nx(class_dependencies_edit) - else: - # the attribute is a property then update as a property - property_info = se.explore_property( - se.get_property_label_from_display_name(attribute["Attribute"]) - ) - property_info["dependencies"].append(dep_label) - property_dependencies_edit = get_property( - se, - attribute["Attribute"], - property_info["domain"], - description=property_info["description"], - requires_dependencies=property_info["dependencies"], - requires_range=property_info["range"], - required=property_info["required"], - validation_rules=property_info["validation_rules"], - ) - se.edit_schema_object_nx(property_dependencies_edit) - - logger.debug(dep + " added to dependencies") - - # TODO check for cycles in attribute dependencies schema subgraph - - # check if the attribute requires any components - if not pd.isnull(attribute["DependsOn Component"]): - component_dependencies = attribute["DependsOn Component"] - else: - continue - - # iterate over potentially multiple dependency components - for comp_dep in component_dependencies.strip().split(","): - - # check if a component is already defined as an attribute; if not define it in the schema - if not comp_dep.strip() in list(schema_extension["Attribute"]): - - # component is not in csv schema so try adding it as a class with a parent Thing - new_class = get_class(se, comp_dep, description=None) - - # check if attribute doesn't already exist in schema.org schema and add it - # (component may not be in csv schema, but could be in the base schema we are extending) - if not attribute_exists(se, new_class["rdfs:label"]): - se.add_schema_object_nx(new_class, **rel_dict) - - # update this attribute requirements to include component - class_info = se.explore_class( - se.get_class_label_from_display_name(attribute["Attribute"]) - ) - class_info["component_dependencies"].append( - se.get_class_label_from_display_name(comp_dep) - ) - class_component_dependencies_edit = get_class( - se, - attribute["Attribute"], - description=class_info["description"], - subclass_of=class_info["subClassOf"], - requires_dependencies=class_info["dependencies"], - requires_range=class_info["range"], - validation_rules=class_info["validation_rules"], - requires_components=class_info["component_dependencies"], - ) - se.edit_schema_object_nx(class_component_dependencies_edit) - - logger.debug(comp_dep + " added to dependencies") - - # TODO check for cycles in component dependencies schema subgraph - - logger.info("Done adding requirements and value ranges to attributes") - - return se - - -def _get_base_schema_path(base_schema: str = None) 
-> str: - """Evaluate path to base schema. - - Args: - base_schema: Path to base data model. BioThings data model is loaded by default. - - Returns: - base_schema_path: Path to base schema based on provided argument. - """ - biothings_schema_path = LOADER.filename("data_models/biothings.model.jsonld") - base_schema_path = biothings_schema_path if base_schema is None else base_schema - - return base_schema_path - - -def _convert_csv_to_data_model( - schema_csv: str, base_schema: str = None -) -> SchemaExplorer: - """Convert provided CSV spec. in CSV format to data model in JSON-LD format. - - Args: - schema_csv: Path to CSV file containing data to be translated to - JSON-LD data model. Can be path to local CSV or URL. - - Returns: - base_se: SchemaExplorer object which has updated properties - (base_se.schema and base_se.schema_nx). - """ - # create data model from provided RFC - rfc_df = load_df(schema_csv, data_model=True) - - # instantiate schema explorer - base_se = SchemaExplorer() - - # determine base schema path - base_schema_path = _get_base_schema_path(base_schema) - - # load base schema (BioThings) - base_se.load_schema(base_schema_path) - - # call parser code that converts a dataframe of the RFC - # specs. into a JSON-LD data model - base_se = create_nx_schema_objects(rfc_df, base_se) - - return base_se \ No newline at end of file diff --git a/schematic/schemas/explorer.py b/schematic/schemas/explorer.py deleted file mode 100644 index 44e24b2ae..000000000 --- a/schematic/schemas/explorer.py +++ /dev/null @@ -1,1041 +0,0 @@ -import os -import string -import json -import logging - -from typing import Any, Dict, Optional, Text, List - -import inflection -import networkx as nx - -from rdflib import Graph, Namespace, plugin, query -from networkx.algorithms.cycles import find_cycle -from networkx.readwrite import json_graph - -from schematic.utils.curie_utils import ( - expand_curies_in_schema, - uri2label, - extract_name_from_uri_or_curie, -) -from schematic.utils.general import find_duplicates -from schematic.utils.io_utils import load_default, load_json, load_schemaorg -from schematic.utils.schema_utils import ( - load_schema_into_networkx, - node_attrs_cleanup, - class_to_node, - relationship_edges, -) -from schematic.utils.general import dict2list, unlist -from schematic.utils.viz_utils import visualize -from schematic.utils.validate_utils import ( - validate_class_schema, - validate_property_schema, - validate_schema, -) -from schematic.schemas.curie import uri2curie, curie2uri - -namespaces = dict(rdf=Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")) - - -logger = logging.getLogger(__name__) - - -class SchemaExplorer: - """Class for exploring schema""" - - def __init__(self): - self.load_default_schema() - - def load_schema(self, schema): - """Load schema and convert it to networkx graph""" - self.schema = load_json(schema) - self.schema_nx = load_schema_into_networkx(self.schema) - - def export_schema(self, file_path): - with open(file_path, "w",encoding="utf8") as f: - json.dump(self.schema, f, sort_keys=True, indent=4, ensure_ascii=False) - - def load_default_schema(self): - """Load default schema, either schema.org or biothings""" - self.schema = load_default() - self.schema_nx = load_schema_into_networkx(self.schema) - - def get_nx_schema(self): - return self.schema_nx - - def get_edges_by_relationship( - self, class_label: str, relationship: str - ) -> List[str]: - """Get a list of out-edges of a node where the edges match a specifc type of relationship. 
- - i.e., the edges connecting a node to its neighbors are of relationship type -- "parentOf" (set of edges to children / sub-class nodes). - Note: possible edge relationships are -- parentOf, rangeValue, requiresDependency. - - Args: - node: the node whose edges we need to look at. - relationship: the type of link(s) that the above node and its immediate neighbors share. - - Returns: - List of edges that are connected to the node. - """ - edges = [] - - mm_graph = self.get_nx_schema() - - for (u, v, key, c) in mm_graph.out_edges(node, data=True, keys=True): - if key == relationship: - edges.append((u, v)) - - return edges - - def get_descendants_by_edge_type( - self, - source_node: str, - relationship: str, - connected: bool = True, - ordered: bool = False, - ) -> List[str]: - """Get all nodes that are descendants of a given source node, based on a specific type of edge / relationship type. - - Args: - source_node: The node whose descendants need to be retreived. - relationship: Edge / link relationship type with possible values same as in above docs. - connected: If True, we need to ensure that all descendant nodes are reachable from the source node, i.e., they are part of the same connected component. - If False, the descendants could be in multiple connected components. - Default value is True. - ordered: If True, the list of descendants will be topologically ordered. - If False, the list has no particular order (depends on the order in which the descendats were traversed in the subgraph). - - Returns: - List of nodes that are descendants from a particular node (sorted / unsorted) - """ - mm_graph = self.get_nx_schema() - - # if mm_graph.has_node(source_node): - # get all nodes that are reachable from a specified root /source node in the data model - - root_descendants = nx.descendants(mm_graph, source_node) - # else: - # print("The specified source node could not be found im the Networkx graph.") - # return [] - - subgraph_nodes = list(root_descendants) - subgraph_nodes.append(source_node) - descendants_subgraph = mm_graph.subgraph(subgraph_nodes) - - # prune the descendants subgraph so as to include only those edges that match the relationship type - rel_edges = [] - for (u, v, key, c) in descendants_subgraph.edges(data=True, keys=True): - if key == relationship: - rel_edges.append((u, v)) - - relationship_subgraph = nx.DiGraph() - relationship_subgraph.add_edges_from(rel_edges) - - descendants = relationship_subgraph.nodes() - - if not descendants: - # return empty list if there are no nodes that are reachable from the source node based on this relationship type - return [] - - if connected and ordered: - # get the set of reachable nodes from the source node - descendants = nx.descendants(relationship_subgraph, source_node) - descendants.add(source_node) - - # normally, the descendants from a node are unordered (peculiarity of nx descendants call) - # form the subgraph on descendants and order it topologically - # this assumes an acyclic subgraph - descendants = nx.topological_sort( - relationship_subgraph.subgraph(descendants) - ) - elif connected: - # get the nodes that are reachable from a given source node - # after the pruning process above some nodes in the root_descendants subgraph might have become disconnected and will be omitted - descendants = nx.descendants(relationship_subgraph, source_node) - descendants.add(source_node) - elif ordered: - # sort the nodes topologically - # this requires the graph to be an acyclic graph - descendants = 
nx.topological_sort(relationship_subgraph) - - return list(descendants) - - def get_adjacent_nodes_by_relationship( - self, node: str, relationship: str - ) -> List[str]: - """Get a list of nodes that is / are adjacent to a given node, based on a relationship type. - - Args: - node: the node whose edges we need to look at. - relationship: the type of link(s) that the above node and its immediate neighbors share. - - Returns: - List of nodes that are adjacent to the given node. - """ - nodes = set() - - mm_graph = self.get_nx_schema() - - for (u, v, key, c) in mm_graph.out_edges(node, data=True, keys=True): - if key == relationship: - nodes.add(v) - - return list(nodes) - - def is_class_in_schema(self, class_label): - if class_label in self.schema_nx.nodes(): - return True - else: - return False - - def full_schema_graph(self, size=None): - edges = self.schema_nx.edges() - return visualize(edges, size=size) - - def sub_schema_graph(self, source, direction, size=None): - if direction == "down": - edges = list(nx.edge_bfs(self.schema_nx, [source])) - return visualize(edges, size=size) - elif direction == "up": - paths = self.find_parent_classes(source) - edges = [] - for _path in paths: - _path.append(source) - for i in range(0, len(_path) - 1): - edges.append((_path[i], _path[i + 1])) - return visualize(edges, size=size) - elif direction == "both": - paths = self.find_parent_classes(source) - edges = list(nx.edge_bfs(self.schema_nx, [source])) - for _path in paths: - _path.append(source) - for i in range(0, len(_path) - 1): - edges.append((_path[i], _path[i + 1])) - return visualize(edges, size=size) - - def find_parent_classes(self, schema_class): - """Find all parents of the class""" - - digraph = self.get_digraph_by_edge_type("parentOf") - - root_node = list(nx.topological_sort(digraph))[0] - # root_node = list(nx.topological_sort(self.schema_nx))[0] - - paths = nx.all_simple_paths( - self.schema_nx, source=root_node, target=schema_class - ) - # print(root_node) - return [_path[:-1] for _path in paths] - - def find_class_specific_properties(self, schema_class): - """Find properties specifically associated with a given class""" - schema_uri = self.schema_nx.nodes[schema_class]["uri"] - properties = [] - for record in self.schema["@graph"]: - if record["@type"] == "rdf:Property": - if ( - type(record["schema:domainIncludes"]) == dict - and record["schema:domainIncludes"]["@id"] == schema_uri - ): - properties.append(record["rdfs:label"]) - elif ( - type(record["schema:domainIncludes"]) == list - and [ - item - for item in record["schema:domainIncludes"] - if item["@id"] == schema_uri - ] - != [] - ): - - properties.append(record["rdfs:label"]) - return properties - - def find_all_class_properties(self, schema_class, display_as_table=False): - """Find all properties associated with a given class - # TODO : need to deal with recursive paths - """ - parents = self.find_parent_classes(schema_class) - # print(schema_class) - # print(parents) - properties = [ - { - "class": schema_class, - "properties": self.find_class_specific_properties(schema_class), - } - ] - for path in parents: - path.reverse() - for _parent in path: - # print(_parent) - properties.append( - { - "class": _parent, - "properties": self.find_class_specific_properties(_parent), - } - ) - if not display_as_table: - return properties - else: - content = [["Property", "Expected Type", "Description", "Class"]] - for record in properties: - for _property in record["properties"]: - property_info = self.explore_property(_property) - 
if "range" in property_info: - content.append( - [ - _property, - property_info["range"], - property_info["description"], - record["class"], - ] - ) - else: - content.append( - [_property, property_info["description"], record["class"]] - ) - - # TODO: Log content - - def find_class_usages(self, schema_class): - """Find where a given class is used as a value of a property""" - usages = [] - schema_uri = self.schema_nx.nodes[schema_class]["uri"] - for record in self.schema["@graph"]: - usage = {} - if record["@type"] == "rdf:Property": - if "schema:rangeIncludes" in record: - p_range = dict2list(record["schema:rangeIncludes"]) - for _doc in p_range: - if _doc["@id"] == schema_uri: - usage["property"] = record["rdfs:label"] - p_domain = dict2list(record["schema:domainIncludes"]) - usage["property_used_on_class"] = unlist( - [self.uri2label(record["@id"]) for record in p_domain] - ) - usage["description"] = record["rdfs:comment"] - if usage: - usages.append(usage) - return usages - - def find_child_classes(self, schema_class): - """Find schema classes that inherit from the given class""" - return unlist(list(self.schema_nx.successors(schema_class))) - - def find_adjacent_child_classes(self, schema_class): - - return self.get_adjacent_nodes_by_relationship(schema_class, "parentOf") - - def explore_class(self, schema_class): - """Find details about a specific schema class""" - parents = [] - if "subClassOf" in self.schema_nx.nodes[schema_class]: - schema_node_val = self.schema_nx.nodes[schema_class]["subClassOf"] - - parents_list = [] - if isinstance(schema_node_val, dict): - parents_list.append(self.schema_nx.nodes[schema_class]["subClassOf"]) - else: - parents_list = schema_node_val - - for parent in parents_list: - parents.append(extract_name_from_uri_or_curie(parent["@id"])) - - requires_range = [] - if "rangeIncludes" in self.schema_nx.nodes[schema_class]: - schema_node_val = self.schema_nx.nodes[schema_class]["rangeIncludes"] - - if isinstance(schema_node_val, dict): - subclass_list = [] - subclass_list.append( - self.schema_nx.nodes[schema_class]["rangeIncludes"] - ) - else: - subclass_list = schema_node_val - - for range_class in subclass_list: - requires_range.append( - extract_name_from_uri_or_curie(range_class["@id"]) - ) - - requires_dependencies = [] - if "requiresDependency" in self.schema_nx.nodes[schema_class]: - schema_node_val = self.schema_nx.nodes[schema_class]["requiresDependency"] - - if isinstance(schema_node_val, dict): - subclass_list = [] - subclass_list.append( - self.schema_nx.nodes[schema_class]["requiresDependency"] - ) - else: - subclass_list = schema_node_val - - for dep_class in subclass_list: - requires_dependencies.append( - extract_name_from_uri_or_curie(dep_class["@id"]) - ) - - requires_components = [] - if "requiresComponent" in self.schema_nx.nodes[schema_class]: - schema_node_val = self.schema_nx.nodes[schema_class]["requiresComponent"] - - if isinstance(schema_node_val, dict): - subclass_list = [] - subclass_list.append( - self.schema_nx.nodes[schema_class]["requiresComponent"] - ) - else: - subclass_list = schema_node_val - - for comp_dep_class in subclass_list: - requires_components.append( - extract_name_from_uri_or_curie(comp_dep_class["@id"]) - ) - - required = False - if "required" in self.schema_nx.nodes[schema_class]: - required = self.schema_nx.nodes[schema_class]["required"] - - validation_rules = [] - if "validationRules" in self.schema_nx.nodes[schema_class]: - validation_rules = self.schema_nx.nodes[schema_class]["validationRules"] - - # 
TODO: make class_info keys here the same as keys in schema graph nodes(e.g. schema_class above); note that downstream code using explore_class would have to be updated as well (e.g. csv_2_schemaorg) - - class_info = { - "properties": self.find_class_specific_properties(schema_class), - "description": self.schema_nx.nodes[schema_class]["description"], - "uri": curie2uri(self.schema_nx.nodes[schema_class]["uri"], namespaces), - #'usage': self.find_class_usages(schema_class), - "usage": "NA", - "child_classes": self.find_adjacent_child_classes(schema_class), - "subClassOf": parents, - "range": requires_range, - "dependencies": requires_dependencies, - "validation_rules": validation_rules, - "required": required, - "component_dependencies": requires_components, - "parent_classes": parents - #'parent_classes': self.find_parent_classes(schema_class) - } - - if "displayName" in self.schema_nx.nodes[schema_class]: - class_info["displayName"] = self.schema_nx.nodes[schema_class][ - "displayName" - ] - - return class_info - - def get_class_validation_rules(self,class_label): - rules=[] - class_info = self.explore_class(class_label) - - if 'validation_rules' in class_info: - rules=class_info['validation_rules'] - - return rules - - def get_property_label_from_display_name(self, display_name, strict_camel_case = False): - """Convert a given display name string into a proper property label string""" - """ - label = ''.join(x.capitalize() or ' ' for x in display_name.split(' ')) - label = label[:1].lower() + label[1:] if label else '' - """ - # This is the newer more strict method - if strict_camel_case: - display_name = display_name.strip().translate({ord(c): "_" for c in string.whitespace}) - label = inflection.camelize(display_name, uppercase_first_letter=False) - - # This method remains for backwards compatibility - else: - display_name = display_name.translate({ord(c): None for c in string.whitespace}) - label = inflection.camelize(display_name.strip(), uppercase_first_letter=False) - - return label - - def get_class_label_from_display_name(self, display_name, strict_camel_case = False): - """Convert a given display name string into a proper class label string""" - """ - label = ''.join(x.capitalize() or ' ' for x in display_name.split(' '))""" - # This is the newer more strict method - if strict_camel_case: - display_name = display_name.strip().translate({ord(c): "_" for c in string.whitespace}) - label = inflection.camelize(display_name, uppercase_first_letter=True) - - # This method remains for backwards compatibility - else: - display_name = display_name.translate({ord(c): None for c in string.whitespace}) - label = inflection.camelize(display_name.strip(), uppercase_first_letter=True) - - return label - - def get_class_by_property(self, property_display_name): - schema_property = self.get_property_label_from_display_name( - property_display_name - ) - - for record in self.schema["@graph"]: - if record["@type"] == "rdf:Property": - if record["rdfs:label"] == schema_property: - p_domain = record["schema:domainIncludes"] - - return [ - self.uri2label(record["@id"]) - for record in p_domain - ] - #return unlist( - # [ - # self.uri2label(schema_class["@id"]) - # for schema_class in p_domain - # ] - #) - - return None - - def uri2label(self, uri): - return uri.split(":")[1] - - def explore_property(self, schema_property): - """Find details about a specific property - TODO: refactor so that explore class and explore property reuse logic - they are *very* similar - """ - property_info = {} - for 
record in self.schema["@graph"]: - if record["@type"] == "rdf:Property": - if record["rdfs:label"] == schema_property: - property_info["id"] = record["rdfs:label"] - property_info["description"] = record["rdfs:comment"] - property_info["uri"] = curie2uri(record["@id"], namespaces) - - p_domain = record["schema:domainIncludes"] - if type(p_domain) == list: - property_info["domain"] = [self.uri2label(record["@id"]) for record in p_domain] - elif type(p_domain) == dict: - property_info["domain"] = [self.uri2label(record["@id"])] - - if "schema:rangeIncludes" in record: - p_range = dict2list(record["schema:rangeIncludes"]) - property_info["range"] = [ - self.uri2label(record["@id"]) for record in p_range - ] - else: - property_info["range"] = [] - - if "sms:required" in record: - if "sms:true" == record["sms:required"]: - property_info["required"] = True - else: - property_info["required"] = False - - validation_rules = [] - if "sms:validationRules" in record: - property_info["validation_rules"] = record[ - "sms:validationRules" - ] - - if "sms:requiresDependency" in record: - p_dependencies = dict2list(record["sms:requiresDependency"]) - property_info["dependencies"] = [ - self.uri2label(record["@id"]) for record in p_dependencies - ] - else: - property_info["dependencies"] = [] - - if "sms:displayName" in record: - property_info["displayName"] = record["sms:displayName"] - - break - - # check if properties are added multiple times - - return property_info - - def generate_class_template(self): - """Generate a template for schema class""" - template = { - "@id": "uri or curie of the class", - "@type": "rdfs:Class", - "rdfs:comment": "description of the class", - "rdfs:label": "class label, should match @id", - "rdfs:subClassOf": {"@id": "parent class, could be list"}, - "schema:isPartOf": {"@id": "http://schema.biothings.io"}, - } - return template - - def generate_property_template(self): - """Generate a template for schema property""" - template = { - "@id": "url or curie of the property", - "@type": "rdf:Property", - "rdfs:comment": "description of the property", - "rdfs:label": "carmel case, should match @id", - "schema:domainIncludes": { - "@id": "class which use it as a property, could be list" - }, - "schema:isPartOf": {"@id": "http://schema.biothings.io"}, - "schema:rangeIncludes": { - "@id": "relates a property to a class that constitutes (one of) the expected type(s) for values of the property" - }, - } - return template - - def edit_class(self, class_info): - """Edit an existing class into schema""" - for i, schema_class in enumerate(self.schema["@graph"]): - if schema_class["rdfs:label"] == class_info["rdfs:label"]: - validate_class_schema(class_info) # why are we doing this in a loop? - - self.schema["@graph"][i] = class_info - break - - # TODO: do we actually need to validate the entire schema if a class is just edited and the class passes validation? 
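A stripped-down illustration of what `edit_class` does to the JSON-LD document: locate the `@graph` entry whose `rdfs:label` matches and replace it in place. The record contents here are made up, and validation plus the networkx reload are omitted.

```python
# Illustrative only: replace a class record in a JSON-LD "@graph" list by rdfs:label.
schema = {
    "@graph": [
        {"@id": "bts:Patient", "@type": "rdfs:Class", "rdfs:label": "Patient",
         "rdfs:comment": "old description"},
    ]
}

class_info = {"@id": "bts:Patient", "@type": "rdfs:Class", "rdfs:label": "Patient",
              "rdfs:comment": "updated description"}

for i, schema_class in enumerate(schema["@graph"]):
    if schema_class["rdfs:label"] == class_info["rdfs:label"]:
        schema["@graph"][i] = class_info   # in-place replacement, as in edit_class()
        break

print(schema["@graph"][0]["rdfs:comment"])  # "updated description"
```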
- # validate_schema(self.schema) - - logger.info(f"Edited the class {class_info['rdfs:label']} successfully.") - self.schema_nx = load_schema_into_networkx(self.schema) - - def update_class(self, class_info): - """Add a new class into schema""" - # print(class_info) - validate_class_schema(class_info) - self.schema["@graph"].append(class_info) - validate_schema(self.schema) - logger.info(f"Updated the class {class_info['rdfs:label']} successfully.") - self.schema_nx = load_schema_into_networkx(self.schema) - - def edit_property(self, property_info): - """Edit an existing property into schema""" - for i, schema_property in enumerate(self.schema["@graph"]): - if schema_property["rdfs:label"] == property_info["rdfs:label"]: - validate_property_schema(property_info) - self.schema["@graph"][i] = property_info - - # TODO: check if properties are added/edited multiple times (e.g. look at explore_property) - break - - validate_schema(self.schema) - logger.info(f"Edited the property {property_info['rdfs:label']} successfully.") - self.schema_nx = load_schema_into_networkx(self.schema) - - def update_property(self, property_info): - """Add a new property into schema""" - validate_property_schema(property_info) - self.schema["@graph"].append(property_info) - validate_schema(self.schema) - logger.info(f"Updated the property {property_info['rdfs:label']} successfully.") - - def get_nodes_descendants(self, graph, component): - """ - Return a list of nodes reachable from source in graph - graph: networkx graph object - component: any given node - """ - all_descendants = list(nx.descendants(graph, component)) - - return all_descendants - - def get_nodes_ancestors(self, graph, component): - """ - Return a list of nodes reachable from source in graph - graph: networkx graph object - component: any given node - """ - all_ancestors = list(nx.ancestors(graph, component)) - - return all_ancestors - - def get_digraph_by_edge_type(self, edge_type): - - multi_digraph = self.schema_nx - - digraph = nx.DiGraph() - for (u, v, key, c) in multi_digraph.edges(data=True, keys=True): - if key == edge_type: - digraph.add_edge(u, v) - - # print(nx.find_cycle(digraph, orientation = "ignore")) - - return digraph - - # version of edit_class() method that directly acts on the networkx graph - def edit_schema_object_nx(self, schema_object: dict) -> None: - node_to_replace = class_to_node(class_to_convert=schema_object) - - # get the networkx graph associated with the SchemaExplorer object in its current state - schema_graph_nx = self.get_nx_schema() - - # outer loop to loop over all the nodes in the graph constructed from master schema - for node, data in schema_graph_nx.nodes(data=True): - - # innner loop to loop over the single node that is to be replaced/edited in the master graph - for replace_node, replace_data in node_to_replace.nodes(data=True): - - # find the node to be replaced in the graph - if node == replace_node: - - # for the "comment", "required", "displayName", "validationRules" fields/keys it's okay to do a direct replacement - # without having to worry about adding/removing any associated edges - - # ques. is it more expensive to do a checking operation (diff b/w fields) or a replace operation? 
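As a rough sketch of the node-level part of this edit (direct attribute replacement plus moving a keyed `parentOf` edge), using a toy `networkx.MultiDiGraph` with made-up node names:

```python
import networkx as nx

g = nx.MultiDiGraph()
g.add_node("Patient", displayName="Patient", required=False)
g.add_node("Thing")
g.add_node("Biospecimen")
g.add_edge("Thing", "Patient", key="parentOf")

# direct attribute replacement, as done for comment/required/displayName/validationRules
g.nodes["Patient"]["required"] = True
g.nodes["Patient"]["displayName"] = "Patient Record"

# edge "rejiggering": drop the old keyed in-edge and add the new parent
g.remove_edges_from([("Thing", "Patient", "parentOf")])
g.add_edge("Biospecimen", "Patient", key="parentOf")

print(list(g.in_edges("Patient", keys=True)))  # [('Biospecimen', 'Patient', 'parentOf')]
```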
- - if ( - "comment" in data and "comment" in replace_data - ): # replace contents of "comment" from replacement node - schema_graph_nx.nodes[node]["comment"] = node_to_replace.nodes[ - replace_node - ]["comment"] - schema_graph_nx.nodes[node][ - "description" - ] = node_to_replace.nodes[replace_node]["description"] - - if ( - "required" in data and "required" in replace_data - ): # replace boolean value of "required" from replacement node - schema_graph_nx.nodes[node]["required"] = node_to_replace.nodes[ - replace_node - ]["required"] - - if ( - "displayName" in data and "displayName" in replace_data - ): # replace contents of "displayName" from replacement node - schema_graph_nx.nodes[node][ - "displayName" - ] = node_to_replace.nodes[replace_node]["displayName"] - - if ( - "validationRules" in data and "validationRules" in replace_data - ): # replace contents of "validationRules" from replacement node - schema_graph_nx.nodes[node][ - "validationRules" - ] = node_to_replace.nodes[replace_node]["validationRules"] - - # for the "subClassOf", "requiresDependency", "requiresComponent", "rangeIncludes" fields/keys require rejiggering - # of associated edges - # general strategy we follow for rejiggering is remove edges that existed formerly and add new edges based on contents - # of the replacement node - - # "subClassOf" key related edge manipulation - if "subClassOf" in replace_data: - - # if the "subClassOf" attribute already exists on the node, then remove all the "parentOf" in-edges - # associated with that node - if "subClassOf" in data: - # remove formerly existent edges from the master schema/graph - for (u, v) in list(schema_graph_nx.in_edges([node])): - - # there are certain nodes which have "subClassOf" data in list format - if type(data["subClassOf"]) == list: - for _edges_to_replace in data["subClassOf"]: - edge_repl = extract_name_from_uri_or_curie( - _edges_to_replace["@id"] - ) - - if edge_repl == u: - - try: - # we need to make sure to remove only edges that are tagged with the "parentOf" label - schema_graph_nx.remove_edges_from( - [(u, v, "parentOf")] - ) - except TypeError: - pass - - # there are certain nodes which have "subClassOf" data in dict format - elif type(data["subClassOf"]) == dict: - for k_id, v_curie in data["subClassOf"].items(): - edge_repl = extract_name_from_uri_or_curie( - v_curie - ) - - if edge_repl == u: - - try: - schema_graph_nx.remove_edges_from( - [(u, v, "parentOf")] - ) - except TypeError: - pass - - # extract node names from replacement node and use it to add edges to the master schema/graph - parents = replace_data["subClassOf"] - if type(parents) == list: - for _parent in parents: - target_node = extract_name_from_uri_or_curie( - _parent["@id"] - ) - - # label to be associated with "subClassOf" keys is "parentOf" - if target_node != replace_node: - - # make note of the fact that we are changing in-edges here - schema_graph_nx.add_edge( - target_node, replace_node, key="parentOf" - ) - elif type(parents) == dict: - for _k_parent, _v_parent in parents.items(): - target_node = extract_name_from_uri_or_curie(_v_parent) - - # label to be associated with "subClassOf" keys is "parentOf" - if target_node != replace_node: - - # make note of the fact that we are changing in-edges here - schema_graph_nx.add_edge( - target_node, replace_node, key="parentOf" - ) - - # once the edges have been added, change the contents of the node - schema_graph_nx.nodes[node][ - "subClassOf" - ] = node_to_replace.nodes[replace_node]["subClassOf"] - - # 
"requiresDependency" key related edge manipulation - if "requiresDependency" in replace_data: - - # if the "requiresDependency" attribute already exists on the node, then remove all the "requiresDependency" in-edges - # associated with that node - if "requiresDependency" in data: - - for (u, v) in list(schema_graph_nx.out_edges([node])): - # there are certain nodes which have "requiresDependency" data in list format - if type(data["requiresDependency"]) == list: - for _edges_to_replace in data["requiresDependency"]: - edge_repl = extract_name_from_uri_or_curie( - _edges_to_replace["@id"] - ) - - if edge_repl == v: - - try: - schema_graph_nx.remove_edges_from( - [u, v, "requiresDependency"] - ) - except TypeError: - pass - - # there are certain nodes which have "requiresDependency" data in dict format - elif type(data["requiresDependency"]) == dict: - for k_id, v_curie in data[ - "requiresDependency" - ].items(): - edge_repl = extract_name_from_uri_or_curie( - v_curie - ) - - if edge_repl == u: - - try: - schema_graph_nx.remove_edges_from( - [u, v, "requiresDependency"] - ) - except TypeError: - pass - - deps = replace_data["requiresDependency"] - if type(deps) == list: - for _dep in deps: - target_node = extract_name_from_uri_or_curie( - _dep["@id"] - ) - - if target_node != replace_node: - - # make not of the fact that edges being added here are out-edges - schema_graph_nx.add_edge( - replace_node, - target_node, - key="requiresDependency", - ) - elif type(deps) == dict: - for _k_dep, _v_dep in deps.items(): - target_node = extract_name_from_uri_or_curie(_v_dep) - - if target_node != replace_node: - - # make not of the fact that edges being added here are out-edges - schema_graph_nx.add_edge( - replace_node, - target_node, - key="requiresDependency", - ) - - schema_graph_nx.nodes[node][ - "requiresDependency" - ] = node_to_replace.nodes[replace_node]["requiresDependency"] - - # "requiresComponent" key related edge manipulation - if "requiresComponent" in replace_data: - - if "requiresComponent" in data: - for (u, v) in list(schema_graph_nx.out_edges([node])): - # there are certain nodes which have "requiresComponent" data in list format - if type(data["requiresComponent"]) == list: - for _edges_to_replace in data["requiresComponent"]: - edge_repl = extract_name_from_uri_or_curie( - _edges_to_replace["@id"] - ) - - if edge_repl == v: - - try: - schema_graph_nx.remove_edges_from( - [u, v, "requiresComponent"] - ) - except TypeError: - pass - - elif type(data["requiresComponent"]) == dict: - for k_id, v_curie in data[ - "requiresComponent" - ].items(): - edge_repl = extract_name_from_uri_or_curie( - v_curie - ) - - if edge_repl == v: - - try: - schema_graph_nx.remove_edges_from( - [u, v, "requiresComponent"] - ) - except TypeError: - pass - - comps = replace_data["requiresComponent"] - if type(comps) == list: - for _comp in comps: - target_node = extract_name_from_uri_or_curie( - _comp["@id"] - ) - - if target_node != replace_node: - schema_graph_nx.add_edge( - replace_node, - target_node, - key="requiresComponent", - ) - elif type(comps) == dict: - for _k_comp, _v_comp in deps.items(): - target_node = extract_name_from_uri_or_curie(_v_comp) - - if target_node != replace_node: - - # make not of the fact that edges being added here are out-edges - schema_graph_nx.add_edge( - replace_node, - target_node, - key="requiresDependency", - ) - - schema_graph_nx.nodes[node][ - "requiresComponent" - ] = node_to_replace.nodes[replace_node]["requiresComponent"] - - # "rangeIncludes" key related edge 
manipulation - if "rangeIncludes" in replace_data: - - if "rangeIncludes" in data: - for (u, v) in list(schema_graph_nx.out_edges([node])): - # there are certain nodes which have "rangeIncludes" data in list format - if type(data["rangeIncludes"]) == list: - for _edges_to_replace in data["rangeIncludes"]: - edge_repl = extract_name_from_uri_or_curie( - _edges_to_replace["@id"] - ) - - if edge_repl == v: - try: - schema_graph_nx.remove_edges_from( - [u, v, "rangeIncludes"] - ) - except TypeError: - pass - - elif type(data["rangeIncludes"]) == dict: - for k_id, v_curie in data["rangeIncludes"].items(): - edge_repl = extract_name_from_uri_or_curie( - v_curie - ) - - if edge_repl == v: - try: - schema_graph_nx.remove_edges_from( - [u, v, "rangeIncludes"] - ) - except TypeError: - pass - - range_inc = replace_data["rangeIncludes"] - if type(range_inc) == list: - for _rinc in range_inc: - target_node = extract_name_from_uri_or_curie( - _rinc["@id"] - ) - - if target_node != replace_node: - schema_graph_nx.add_edge( - replace_node, target_node, key="rangeValue" - ) - elif type(range_inc) == dict: - for _k_rinc, _v_rinc in deps.items(): - target_node = extract_name_from_uri_or_curie(_v_rinc) - - if target_node != replace_node: - - # make not of the fact that edges being added here are out-edges - schema_graph_nx.add_edge( - replace_node, target_node, key="rangeValue" - ) - - schema_graph_nx.nodes[node][ - "rangeIncludes" - ] = node_to_replace.nodes[replace_node]["rangeIncludes"] - - # set the networkx schema graph to the the modified networkx schema - self.schema_nx = schema_graph_nx - - # print("Added node {} to the graph successfully.".format(schema_object["rdfs:label"])) - - # part of the code that replaces the modified class in the original JSON-LD schema (not in the data/ folder though) - for i, schema_class in enumerate(self.schema["@graph"]): - if schema_class["rdfs:label"] == schema_object["rdfs:label"]: - # validate_class_schema(schema_object) # validate that the class to be modified follows the structure for any generic class (node) - - self.schema["@graph"][i] = schema_object - break - - # version of update_class() method that directly acts on the networkx graph - def add_schema_object_nx(self, schema_object: dict, **kwargs: dict) -> None: - node = node_attrs_cleanup(schema_object) - - if "required" in node: - if "sms:true" == schema_object["sms:required"]: - node["required"] = True - else: - node["required"] = False - - if "sms:validationRules" in schema_object: - node["validationRules"] = schema_object["sms:validationRules"] - else: - node["validationRules"] = [] - - node["uri"] = schema_object["@id"] - node["description"] = schema_object["rdfs:comment"] - - # get the networkx graph associated with the SchemaExplorer object in its current state - schema_graph_nx = self.get_nx_schema() - - # add node to graph - schema_graph_nx.add_node(schema_object["rdfs:label"], **node) - - schema_graph_nx = relationship_edges(schema_graph_nx, schema_object, **kwargs) - - # set the networkx schema graph to the the modified networkx schema - self.schema_nx = schema_graph_nx - - # print("Edited node {} successfully.".format(schema_object["rdfs:label"])) - - # update the JSON-LD schema after modifying the networkx graph - # validate_class_schema(schema_object) - self.schema["@graph"].append(schema_object) - # validate_schema(self.schema) \ No newline at end of file diff --git a/schematic/schemas/generator.py b/schematic/schemas/generator.py deleted file mode 100644 index 530147736..000000000 --- 
a/schematic/schemas/generator.py +++ /dev/null @@ -1,723 +0,0 @@ -import gc -import os -import json -import logging -from typing import Any, Dict, Optional, Text, List - -import networkx as nx - -from schematic.schemas.explorer import SchemaExplorer -from schematic.utils.io_utils import load_json -from schematic.utils.cli_utils import query_dict -from schematic.utils.schema_utils import load_schema_into_networkx -from schematic.utils.validate_utils import validate_schema, rule_in_rule_list - - -logger = logging.getLogger(__name__) - - -class SchemaGenerator(object): - def __init__( - self, - path_to_json_ld: str = None, - schema_explorer: SchemaExplorer = None, - requires_dependency_relationship: str = "requiresDependency", # optional parameter(s) with default value - requires_range: str = "rangeIncludes", - range_value_relationship: str = "rangeValue", - requires_component_relationship: str = "requiresComponent", - ) -> None: - """Create / Initialize object of type SchemaGenerator(). - - Methods / utilities that are part of this module can be used to generate JSON validation schemas for different schema.org - specification models. - - Args: - path_to_json_ld: Path to the JSON-LD file that is representing the schema.org data model that we want to validate. - schema_explorer: SchemaExplorer instance containing the schema.org data model that we want to validate. - requires_dependency_relationship: Edge relationship between two nodes indicating that they are dependent on each other. - requires_range: A node propertly indicating that a term can assume a value equal to any of the terms that are in the current term's range. - range_value_relationship: Edge relationship that indicates a term / node that another node depends on, is part of the other node's range. - requires_component_relationship: A node property indicating that this node requires a component for its full characterization. - - Returns: - None - """ - - self.jsonld_path = path_to_json_ld - - if schema_explorer is None: - - assert ( - self.jsonld_path is not None - ), "You must provide either `path_to_json_ld` or `schema_explorer`." - - self.jsonld_path_root, jsonld_ext = os.path.splitext(self.jsonld_path) - - assert jsonld_ext == ".jsonld", ( - "Please make sure the 'path_to_json_ld' parameter " - "is pointing to a valid JSON-LD file." - ) - - # create an instance of SchemaExplorer - self.se = SchemaExplorer() - - # convert the JSON-LD data model to networkx object - self.se.load_schema(self.jsonld_path) - - else: - - # Confirm that given SchemaExplorer instance is valid - assert ( - getattr(schema_explorer, "schema") is not None - and getattr(schema_explorer, "schema_nx") is not None - ), ( - "SchemaExplorer instance given to `schema_explorer` argument " - "does not have both the `schema` and `schema_nx` attributes." 
- ) - - # User given instance of SchemaExplorer - self.se = schema_explorer - - # custom value(s) of following relationship attributes are passed during initialization - self.requires_dependency_relationship = requires_dependency_relationship - self.requires_range = requires_range - self.range_value_relationship = range_value_relationship - self.requires_component_relationship = requires_component_relationship - - def get_edges_by_relationship(self, node: str, relationship: str) -> List[str]: - """ - See class definition in SchemaExplorer - TODO: possibly remove this wrapper and refactor downstream code to call from SchemaExplorer - """ - - return self.se.get_edges_by_relationship(node, relationship) - - def get_adjacent_nodes_by_relationship( - self, node: str, relationship: str - ) -> List[str]: - - """ - See class definition in SchemaExplorer - TODO: possibly remove this wrapper and refactor downstream code to call from SchemaExplorer - """ - - return self.se.get_adjacent_nodes_by_relationship(node, relationship) - - def get_subgraph_by_edge_type( - self, graph: nx.MultiDiGraph, relationship: str - ) -> nx.DiGraph: - """Get a subgraph containing all edges of a given type (aka relationship). - TODO: possibly move method to SchemaExplorer and refactor downstream code to call from SchemaExplorer - - Args: - graph: input multi digraph (aka hypergraph) - relationship: edge / link relationship type with possible values same as in above docs. - - Returns: - Directed graph on edges of a particular type (aka relationship) - """ - - # prune the metadata model graph so as to include only those edges that match the relationship type - rel_edges = [] - for (u, v, key, c) in graph.out_edges(data=True, keys=True): - if key == relationship: - rel_edges.append((u, v)) - - relationship_subgraph = nx.DiGraph() - relationship_subgraph.add_edges_from(rel_edges) - - return relationship_subgraph - - def get_descendants_by_edge_type( - self, - source_node: str, - relationship: str, - connected: bool = True, - ordered: bool = False, - ) -> List[str]: - - """ - See class definition in SchemaExplorer - TODO: possibly remove this wrapper and refactor downstream code to call from SchemaExplorer - """ - - return self.se.get_descendants_by_edge_type( - source_node, relationship, connected, ordered - ) - - def get_component_requirements(self, source_component: str) -> List[str]: - """Get all components that are associated with a given source component and are required by it. - - Args: - source_component: source component for which we need to find all required downstream components. - - Returns: - List of nodes that are descendants from the source component are are related to the source through a specific component relationship. - """ - - req_components = list( - reversed( - self.get_descendants_by_edge_type( - source_component, self.requires_component_relationship, ordered=True - ) - ) - ) - - return req_components - - def get_component_requirements_graph(self, source_component: str) -> nx.DiGraph: - """Get all components that are associated with a given source component and are required by it; return the components as a dependency graph (i.e. a DAG). - - Args: - source_component: source component for which we need to find all required downstream components. - - Returns: - A subgraph of the schema graph induced on nodes that are descendants from the source component and are related to the source through a specific component relationship. 
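In practice, a `SchemaGenerator` was built from a JSON-LD path (or from a ready `SchemaExplorer`) and then queried. A hedged usage sketch follows; the model path and component name are placeholders, and since this class is removed by this PR the sketch only reflects the pre-existing interface.

```python
# Hypothetical usage of the (now removed) SchemaGenerator interface;
# the model path and component name are placeholders.
from schematic.schemas.generator import SchemaGenerator

sg = SchemaGenerator(path_to_json_ld="data_models/example.model.jsonld")

# every component required (directly or transitively) by the source component,
# returned in reversed topological order (deepest requirements first)
req_components = sg.get_component_requirements("Patient")
print(req_components)
```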
- """ - - # get a list of required component nodes - req_components = self.get_component_requirements(source_component) - - # get the schema graph - mm_graph = self.se.get_nx_schema() - - # get the subgraph induced on required component nodes - req_components_graph = self.get_subgraph_by_edge_type( - mm_graph, self.requires_component_relationship - ).subgraph(req_components) - - return req_components_graph - - def get_node_dependencies( - self, source_node: str, display_names: bool = True, schema_ordered: bool = True - ) -> List[str]: - """Get the immediate dependencies that are related to a given source node. - - Args: - source_node: The node whose dependencies we need to compute. - display_names: if True, return list of display names of each of the dependencies. - if False, return list of node labels of each of the dependencies. - schema_ordered: if True, return the dependencies of the node following the order of the schema (slower). - if False, return dependencies from graph without guaranteeing schema order (faster) - - Returns: - List of nodes that are dependent on the source node. - """ - mm_graph = self.se.get_nx_schema() - - if schema_ordered: - # get dependencies in the same order in which they are defined in the schema - required_dependencies = self.se.explore_class(source_node)["dependencies"] - else: - required_dependencies = self.get_adjacent_nodes_by_relationship( - source_node, self.requires_dependency_relationship - ) - - if display_names: - # get display names of dependencies - dependencies_display_names = [] - - for req in required_dependencies: - dependencies_display_names.append(mm_graph.nodes[req]["displayName"]) - - return dependencies_display_names - - return required_dependencies - - def get_node_range(self, node_label: str, display_names: bool = True) -> List[str]: - """Get the range, i.e., all the valid values that are associated with a node label. - - Args: - node_label: Node / termn for which you need to retrieve the range. - - Returns: - List of display names of nodes associateed with the given node. - """ - mm_graph = self.se.get_nx_schema() - - try: - # get node range in the order defined in schema for given node - required_range = self.se.explore_class(node_label)["range"] - except KeyError: - raise ValueError( - f"The source node {node_label} does not exist in the graph. " - "Please use a different node." - ) - - if display_names: - # get the display name(s) of all dependencies - dependencies_display_names = [] - - for req in required_range: - dependencies_display_names.append(mm_graph.nodes[req]["displayName"]) - - return dependencies_display_names - - return required_range - - def get_node_label(self, node_display_name: str) -> str: - """Get the node label for a given display name. - - Args: - node_display_name: Display name of the node which you want to get the label for. - - Returns: - Node label associated with given node. - - Raises: - KeyError: If the node cannot be found in the graph. 
- """ - mm_graph = self.se.get_nx_schema() - - node_class_label = self.se.get_class_label_from_display_name(node_display_name) - node_property_label = self.se.get_property_label_from_display_name( - node_display_name - ) - - if node_class_label in mm_graph.nodes: - node_label = node_class_label - elif node_property_label in mm_graph.nodes: - node_label = node_property_label - else: - node_label = "" - - return node_label - - def get_node_definition(self, node_display_name: str) -> str: - """Get the node definition, i.e., the "comment" associated with a given node display name. - - Args: - node_display_name: Display name of the node which you want to get the label for. - - Returns: - Comment associated with node, as a string. - """ - node_label = self.get_node_label(node_display_name) - - if not node_label: - return "" - - mm_graph = self.se.get_nx_schema() - node_definition = mm_graph.nodes[node_label]["comment"] - - return node_definition - - def get_node_validation_rules(self, node_display_name: str) -> str: - """Get validation rules associated with a node, - - Args: - node_display_name: Display name of the node which you want to get the label for. - - Returns: - A set of validation rules associated with node, as a list. - """ - node_label = self.get_node_label(node_display_name) - - if not node_label: - return [] - - mm_graph = self.se.get_nx_schema() - node_validation_rules = mm_graph.nodes[node_label]["validationRules"] - - return node_validation_rules - - def is_node_required(self, node_display_name: str) -> bool: - """Check if a given node is required or not. - - Note: The possible options that a node can be associated with -- "required" / "optional". - - Args: - node_display_name: Display name of the node which you want to get the label for. - - Returns: - True: If the given node is a "required" node. - False: If the given node is not a "required" (i.e., an "optional") node. - """ - node_label = self.get_node_label(node_display_name) - - mm_graph = self.se.get_nx_schema() - node_required = mm_graph.nodes[node_label]["required"] - - return node_required - - def get_node_display_name(self, node_label: str, mm_graph: nx.MultiDiGraph) -> list: - """Get display name associated with a given node label, return id if no display name. - Args: - node_label, str: Node to retrieve display name for - Returns: - node_display_name: display name of the node, or its id if it does not have a display name. - """ - if "displayName" in mm_graph.nodes[node_label]: - node_display_name = mm_graph.nodes[node_label]["displayName"] - else: - node_display_name = mm_graph.nodes[node_label]["id"].split(':')[1] - return node_display_name - - def get_nodes_display_names( - self, node_list: List[str], mm_graph: nx.MultiDiGraph - ) -> List[str]: - """Get display names associated with the given list of nodes. - - Args: - node_list: List of nodes whose display names we need to retrieve. - - Returns: - List of display names, return id if no display name - """ - - node_list_display_names = [self.get_node_display_name(node, mm_graph) for node in node_list] - - return node_list_display_names - - def get_range_schema( - self, node_range: List[str], node_name: str, blank=False - ) -> Dict[str, Dict[str, List[str]]]: - """Add a list of nodes to the "enum" key in a given JSON schema object. - - Args: - node_name: Name of the "main" / "head" key in the JSON schema / object. - node_range: List of nodes to be added to the JSON object. - blank: If True, add empty node to end of node list. 
- If False, do not add empty node to end of node list. - - Returns: - JSON object with nodes. - """ - if blank: - schema_node_range = {node_name: {"enum": node_range + [""]}} - else: - schema_node_range = {node_name: {"enum": node_range}} - - return schema_node_range - - def get_array_schema( - self, node_range: List[str], node_name: str, blank=False - ) -> Dict[str, Dict[str, List[str]]]: - """Add a list of nodes to the "enum" key in a given JSON schema object. - Allow a node to be mapped to any subset of the list - - Args: - node_name: Name of the "main" / "head" key in the JSON schema / object. - node_range: List of nodes to be added to the JSON object. - blank: If True, add empty node to end of node list. - If False, do not add empty node to end of node list. - - Returns: - JSON object with array validation rule. - """ - - schema_node_range_array = { - node_name: { - "type": "array", - "items": {"enum": node_range + [""] if blank else node_range}, - "maxItems": len(node_range), - } - } - - return schema_node_range_array - - def get_non_blank_schema( - self, node_name: str - ) -> Dict: # can't define heterogenous Dict generic types - """Get a schema rule that does not allow null or empty values. - - Args: - node_name: Name of the node on which the schema rule is to be applied. - - Returns: - Schema rule as a JSON object. - """ - non_blank_schema = {node_name: {"not": {"type": "null"}, "minLength": 1}} - - return non_blank_schema - - def is_required(self, node_name: str, mm_graph: nx.MultiDiGraph) -> bool: - """ - Check if a node is required - - Args: - node_name: Name of the node on which the check is to be applied. - - Returns: - Boolean value indicating if the node is required or not. - True: yes, it is required. - False: no, it is not required. - Return False, if no required key - """ - if "required" in mm_graph.nodes[node_name]: - return mm_graph.nodes[node_name]["required"] - else: - return False - - def get_json_schema_requirements(self, source_node: str, schema_name: str) -> Dict: - """Consolidated method that aims to gather dependencies and value constraints across terms / nodes in a schema.org schema and store them in a jsonschema /JSON Schema schema. - - It does so for any given node in the schema.org schema (recursively) using the given node as starting point in the following manner: - 1) Find all the nodes / terms this node depends on (which are required as "additional metadata" given this node is "required"). - 2) Find all the allowable metadata values / nodes that can be assigned to a particular node (if such a constraint is specified on the schema). - - Args: - source_node: Node from which we can start recursive dependancy traversal (as mentioned above). - schema_name: Name assigned to JSON-LD schema (to uniquely identify it via URI when it is hosted on the Internet). - - Returns: - JSON Schema as a dictionary. 
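For concreteness, these helpers emit plain dict fragments that are later merged into the generated JSON Schema. A sketch of the three shapes, with an illustrative attribute name and value set:

```python
node_name, node_range = "Sex", ["Female", "Male", "Other"]

# get_range_schema(..., blank=True): a single-valued enum, optionally allowing ""
range_schema = {node_name: {"enum": node_range + [""]}}

# get_array_schema(...): the same enum, but any subset of the values is allowed
array_schema = {
    node_name: {
        "type": "array",
        "items": {"enum": node_range + [""]},
        "maxItems": len(node_range),
    }
}

# get_non_blank_schema(...): forbid null/empty values for a required attribute
non_blank_schema = {node_name: {"not": {"type": "null"}, "minLength": 1}}
```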
- """ - json_schema = { - "$schema": "http://json-schema.org/draft-07/schema#", - "$id": "http://example.com/" + schema_name, - "title": schema_name, - "type": "object", - "properties": {}, - "required": [], - "allOf": [], - } - - # get graph corresponding to data model schema - mm_graph = self.se.get_nx_schema() - - nodes_to_process = ( - [] - ) # list of nodes to be checked for dependencies, starting with the source node - processed_nodes = ( - [] - ) # keep of track of nodes whose dependencies have been processed - reverse_dependencies = ( - {} - ) # maintain a map between conditional nodes and their dependencies (reversed) -- {dependency : conditional_node} - range_domain_map = ( - {} - ) # maintain a map between range nodes and their domain nodes {range_value : domain_value} - # the domain node is very likely the parentof ("parentOf" relationship) of the range node - - root_dependencies = self.get_adjacent_nodes_by_relationship( - source_node, self.requires_dependency_relationship - ) - - # if root_dependencies is empty it means that a class with name 'source_node' exists - # in the schema, but it is not a valid component - if not root_dependencies: - raise ValueError(f"'{source_node}' is not a valid component in the schema.") - - nodes_to_process += root_dependencies - - process_node = nodes_to_process.pop(0) - - while process_node: - - if not process_node in processed_nodes: - # node is being processed - node_is_processed = True - - node_range = self.get_adjacent_nodes_by_relationship( - process_node, self.range_value_relationship - ) - - # get node range display name - node_range_d = self.get_nodes_display_names(node_range, mm_graph) - - node_dependencies = self.get_adjacent_nodes_by_relationship( - process_node, self.requires_dependency_relationship - ) - - # get process node display name - node_display_name = self.get_node_display_name(node_label=process_node, mm_graph=mm_graph) - - # updating map between node and node's valid values - for n in node_range_d: - if not n in range_domain_map: - range_domain_map[n] = [] - range_domain_map[n].append(node_display_name) - - # can this node be map to the empty set (if required no; if not required yes) - # TODO: change "required" to different term, required may be a bit misleading (i.e. is the node required in the schema) - node_required = self.is_required(process_node, mm_graph) - - # get any additional validation rules associated with this node (e.g. 
can this node be mapped to a list of other nodes) - node_validation_rules = self.get_node_validation_rules( - node_display_name - ) - - if node_display_name in reverse_dependencies: - # if node has conditionals set schema properties and conditional dependencies - # set schema properties - if node_range: - # if process node has valid value range set it in schema properties - schema_valid_vals = self.get_range_schema( - node_range_d, node_display_name, blank=True - ) - - if node_validation_rules: - # if this node has extra validation rules process them - # TODO: abstract this into its own validation rule constructor/generator module/class - if rule_in_rule_list("list", node_validation_rules): - # if this node can be mapped to a list of nodes - # set its schema accordingly - schema_valid_vals = self.get_array_schema( - node_range_d, node_display_name, blank=True - ) - - else: - # otherwise, by default allow any values - schema_valid_vals = {node_display_name: {}} - - json_schema["properties"].update(schema_valid_vals) - - # set schema conditional dependencies - for node in reverse_dependencies[node_display_name]: - # set all of the conditional nodes that require this process node - - # get node domain if any - # ow this node is a conditional requirement - if node in range_domain_map: - domain_nodes = range_domain_map[node] - conditional_properties = {} - - for domain_node in domain_nodes: - - # set range of conditional node schema - conditional_properties.update( - { - "properties": {domain_node: {"enum": [node]}}, - "required": [domain_node], - } - ) - - # given node conditional are satisfied, this process node (which is dependent on these conditionals) has to be set or not depending on whether it is required - if node_range: - dependency_properties = self.get_range_schema( - node_range_d, - node_display_name, - blank=not node_required, - ) - - if node_validation_rules: - if rule_in_rule_list("list", node_validation_rules): - # TODO: get_range_schema and get_range_schema have similar behavior - combine in one module - dependency_properties = self.get_array_schema( - node_range_d, - node_display_name, - blank=not node_required, - ) - - else: - if node_required: - dependency_properties = self.get_non_blank_schema( - node_display_name - ) - else: - dependency_properties = {node_display_name: {}} - schema_conditional_dependencies = { - "if": conditional_properties, - "then": { - "properties": dependency_properties, - "required": [node_display_name], - }, - } - - # update conditional-dependency rules in json schema - json_schema["allOf"].append( - schema_conditional_dependencies - ) - - else: - # node doesn't have conditionals - if node_required: - if node_range: - schema_valid_vals = self.get_range_schema( - node_range_d, node_display_name, blank=False - ) - - if node_validation_rules: - # If there are valid values AND they are expected to be a list, - # reformat the Valid Values. 
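The `if`/`then` blocks appended to `allOf` by this branch end up looking roughly like the fragment below; the attribute names and valid values are illustrative, not taken from a real model.

```python
# Illustrative shape of one conditional-dependency rule appended to json_schema["allOf"]:
schema_conditional_dependencies = {
    "if": {
        "properties": {"Diagnosis": {"enum": ["Cancer"]}},  # the conditional (domain) node
        "required": ["Diagnosis"],
    },
    "then": {
        "properties": {"Cancer Type": {"enum": ["Breast", "Lung", ""]}},
        "required": ["Cancer Type"],
    },
}
```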
- if rule_in_rule_list("list", node_validation_rules): - schema_valid_vals = self.get_array_schema( - node_range_d, node_display_name, blank=False - ) - else: - schema_valid_vals = self.get_non_blank_schema( - node_display_name - ) - - json_schema["properties"].update(schema_valid_vals) - # add node to required fields - json_schema["required"] += [node_display_name] - - elif process_node in root_dependencies: - # node doesn't have conditionals and is not required; it belongs in the schema only if it is in root's dependencies - - if node_range: - schema_valid_vals = self.get_range_schema( - node_range_d, node_display_name, blank=True - ) - - if node_validation_rules: - if rule_in_rule_list("list", node_validation_rules): - schema_valid_vals = self.get_array_schema( - node_range_d, node_display_name, blank=True - ) - - else: - schema_valid_vals = {node_display_name: {}} - - json_schema["properties"].update(schema_valid_vals) - - else: - # node doesn't have conditionals and it is not required and it is not a root dependency - # the node doesn't belong in the schema - # do not add to processed nodes since its conditional may be traversed at a later iteration (though unlikely for most schemas we consider) - node_is_processed = False - - # add process node as a conditional to its dependencies - node_dependencies_d = self.get_nodes_display_names( - node_dependencies, mm_graph - ) - - for dep in node_dependencies_d: - if not dep in reverse_dependencies: - reverse_dependencies[dep] = [] - - reverse_dependencies[dep].append(node_display_name) - - # add nodes found as dependencies and range of this processed node - # to the list of nodes to be processed - nodes_to_process += node_range - nodes_to_process += node_dependencies - - # if the node is processed add it to the processed nodes set - if node_is_processed: - processed_nodes.append(process_node) - - # if the list of nodes to process is not empty - # set the process node the next remaining node to process - if nodes_to_process: - process_node = nodes_to_process.pop(0) - else: - # no more nodes to process - # exit the loop - break - - logger.info("JSON schema successfully generated from schema.org schema!") - - # if no conditional dependencies were added we can't have an empty 'AllOf' block in the schema, so remove it - if not json_schema["allOf"]: - del json_schema["allOf"] - - # If no config value and SchemaGenerator was initialized with - # a JSON-LD path, construct - if self.jsonld_path is not None: - prefix = self.jsonld_path_root - prefix_root, prefix_ext = os.path.splitext(prefix) - if prefix_ext == ".model": - prefix = prefix_root - json_schema_log_file = f"{prefix}.{source_node}.schema.json" - - logger.info( - "The JSON schema file can be inspected by setting the following " - "nested key in the configuration: (model > input > log_location)." 
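Downstream, the returned dictionary can be fed to any standard JSON Schema validator. A hedged sketch using the `jsonschema` package (the schema and record here are toy stand-ins for a generated component schema and a manifest row):

```python
from jsonschema import Draft7Validator

json_schema = {
    "$schema": "http://json-schema.org/draft-07/schema#",
    "title": "example",
    "type": "object",
    "properties": {"Sex": {"enum": ["Female", "Male", "Other", ""]}},
    "required": ["Sex"],
}

record = {"Sex": "Unknown"}
for err in Draft7Validator(json_schema).iter_errors(record):
    print(err.message)  # "'Unknown' is not one of ['Female', 'Male', 'Other', '']"
```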
- ) - - logger.info(f"JSON schema file log stored as {json_schema_log_file}") - - return json_schema diff --git a/schematic/schemas/validator.py b/schematic/schemas/json_schema_validator.py similarity index 100% rename from schematic/schemas/validator.py rename to schematic/schemas/json_schema_validator.py diff --git a/schematic/store/synapse.py b/schematic/store/synapse.py index fa08e09ba..c291b742e 100644 --- a/schematic/store/synapse.py +++ b/schematic/store/synapse.py @@ -1,24 +1,25 @@ -from datetime import datetime, timedelta +import atexit +from collections import OrderedDict from copy import deepcopy -import os -import uuid # used to generate unique names for entities +from datetime import datetime, timedelta +from dataclasses import dataclass import json -import atexit import logging +import numpy as np +import pandas as pd +import os +import re import secrets -from dataclasses import dataclass import shutil +import synapseclient +import uuid # used to generate unique names for entities +from tenacity import retry, stop_after_attempt, wait_chain, wait_fixed, retry_if_exception_type +from time import sleep # allows specifying explicit variable types from typing import Dict, List, Tuple, Sequence, Union, Optional -from collections import OrderedDict -from tenacity import retry, stop_after_attempt, wait_chain, wait_fixed, retry_if_exception_type -import numpy as np -import pandas as pd -import re -import synapseclient -from time import sleep + from synapseclient import ( Synapse, File, @@ -30,7 +31,6 @@ Column, as_table_columns, ) - from synapseclient.entity import File from synapseclient.table import CsvFileTable, build_table, Schema from synapseclient.annotations import from_synapse_annotations @@ -42,19 +42,26 @@ from schematic_db.rdb.synapse_database import SynapseDatabase +from schematic.schemas.data_model_graph import DataModelGraphExplorer from schematic.utils.df_utils import update_df, load_df, col_in_dataframe from schematic.utils.validate_utils import comma_separated_list_regex, rule_in_rule_list -from schematic.utils.general import entity_type_mapping, get_dir_size, convert_gb_to_bytes, create_temp_folder, check_synapse_cache_size, clear_synapse_cache -from schematic.schemas.explorer import SchemaExplorer -from schematic.schemas.generator import SchemaGenerator +# entity_type_mapping, get_dir_size, create_temp_folder, check_synapse_cache_size, and clear_synapse_cache functions are used for AWS deployment +# Please do not remove these import statements +from schematic.utils.general import (entity_type_mapping, + get_dir_size, + convert_gb_to_bytes, + create_temp_folder, + check_synapse_cache_size, + clear_synapse_cache, + profile, + calculate_datetime) +from schematic.utils.schema_utils import get_class_label_from_display_name + from schematic.store.base import BaseStorage from schematic.exceptions import MissingConfigValueError, AccessCredentialsError - from schematic.configuration.configuration import CONFIG -from schematic.utils.general import profile, calculate_datetime - logger = logging.getLogger("Synapse storage") @dataclass @@ -626,10 +633,11 @@ def fill_in_entity_id_filename(self, datasetId: str, manifest: pd.DataFrame) -> manifest = manifest.fillna("") return dataset_files, manifest - def updateDatasetManifestFiles(self, sg: SchemaGenerator, datasetId: str, store:bool = True) -> Union[Tuple[str, pd.DataFrame], None]: + def updateDatasetManifestFiles(self, dmge: DataModelGraphExplorer, datasetId: str, store:bool = True) -> Union[Tuple[str, pd.DataFrame], None]: 
"""Fetch the names and entity IDs of all current files in dataset in store, if any; update dataset's manifest with new files, if any. Args: + dmge: DataModelGraphExplorer Instance datasetId: synapse ID of a storage dataset. store: if set to True store updated manifest in asset store; if set to False return a Pandas dataframe containing updated manifest but do not store to asset store @@ -662,7 +670,7 @@ def updateDatasetManifestFiles(self, sg: SchemaGenerator, datasetId: str, store: manifest.to_csv(manifest_filepath, index=False) # store manifest and update associated metadata with manifest on Synapse - manifest_id = self.associateMetadataWithFiles(sg, manifest_filepath, datasetId) + manifest_id = self.associateMetadataWithFiles(dmge, manifest_filepath, datasetId) return manifest_id, manifest @@ -802,7 +810,7 @@ def getProjectManifests( return manifests - def upload_project_manifests_to_synapse(self, sg: SchemaGenerator, projectId: str) -> List[str]: + def upload_project_manifests_to_synapse(self, dmge: DataModelGraphExplorer, projectId: str) -> List[str]: """Upload all metadata manifest files across all datasets in a specified project as tables in Synapse. Returns: String of all the manifest_table_ids of all the manifests that have been loaded. @@ -824,7 +832,7 @@ def upload_project_manifests_to_synapse(self, sg: SchemaGenerator, projectId: st manifest_name = manifest_info["properties"]["name"] manifest_path = manifest_info["path"] manifest_df = load_df(manifest_path) - manifest_table_id = uploadDB(sg=sg, manifest=manifest, datasetId=datasetId, table_name=datasetName) + manifest_table_id = uploadDB(dmge=dmge, manifest=manifest, datasetId=datasetId, table_name=datasetName) manifest_loaded.append(datasetName) return manifest_loaded @@ -835,8 +843,20 @@ def upload_annotated_project_manifests_to_synapse(self, projectId:str, path_to_j Assumes the manifest is already present as a CSV in a dataset in the project. ''' + # Instantiate DataModelParser + data_model_parser = DataModelParser(path_to_data_model = path_to_json_ld) + #Parse Model + parsed_data_model = data_model_parser.parse_model() + + # Instantiate DataModelGraph + data_model_grapher = DataModelGraph(parsed_data_model) + + # Generate graph + graph_data_model = data_model_grapher.generate_data_model_graph() + + #Instantiate DataModelGraphExplorer + dmge = DataModelGraphExplorer(graph_data_model) - sg = SchemaGenerator(path_to_json_ld) manifests = [] manifest_loaded = [] datasets = self.getStorageDatasetsInProject(projectId) @@ -855,7 +875,7 @@ def upload_annotated_project_manifests_to_synapse(self, projectId:str, path_to_j manifest_path = manifest_info["path"] manifest = ((datasetId, datasetName), (manifest_id, manifest_name), ("", "")) if not dry_run: - manifest_syn_id = self.associateMetadataWithFiles(sg, manifest_path, datasetId, manifest_record_type='table') + manifest_syn_id = self.associateMetadataWithFiles(dmge, manifest_path, datasetId, manifest_record_type='table') manifest_loaded.append(manifest) return manifests, manifest_loaded @@ -954,7 +974,7 @@ def get_table_info(self, datasetId: str = None, projectId: str = None) -> List[s @missing_entity_handler def uploadDB(self, - sg: SchemaGenerator, + dmge: DataModelGraphExplorer, manifest: pd.DataFrame, datasetId: str, table_name: str, @@ -966,7 +986,7 @@ def uploadDB(self, Method to upload a database to an asset store. 
In synapse, this will upload a metadata table Args: - sg: schemaGenerator object + dmge: DataModelGraphExplorer object manifest: pd.Df manifest to upload datasetId: synID of the dataset for the manifest table_name: name of the table to be uploaded @@ -983,18 +1003,18 @@ def uploadDB(self, """ - col_schema, table_manifest = self.formatDB(sg=sg, manifest=manifest, useSchemaLabel=useSchemaLabel) + col_schema, table_manifest = self.formatDB(dmge=dmge, manifest=manifest, useSchemaLabel=useSchemaLabel) - manifest_table_id = self.buildDB(datasetId, table_name, col_schema, table_manifest, table_manipulation, sg, restrict,) + manifest_table_id = self.buildDB(datasetId, table_name, col_schema, table_manifest, table_manipulation, dmge, restrict,) return manifest_table_id, manifest, table_manifest - def formatDB(self, sg, manifest, useSchemaLabel): + def formatDB(self, dmge, manifest, useSchemaLabel): """ Method to format a manifest appropriatly for upload as table Args: - sg: schemaGenerator object + dmge: DataModelGraphExplorer object manifest: pd.Df manifest to upload useSchemaLabel: bool whether to use schemaLabel (True) or display label (False) @@ -1012,7 +1032,7 @@ def formatDB(self, sg, manifest, useSchemaLabel): if useSchemaLabel: cols = [ - sg.se.get_class_label_from_display_name( + get_class_label_from_display_name( str(col) ).translate({ord(x): '' for x in blacklist_chars}) for col in manifest_columns @@ -1044,7 +1064,7 @@ def buildDB(self, col_schema: List, table_manifest: pd.DataFrame, table_manipulation: str, - sg: SchemaGenerator, + dmge: DataModelGraphExplorer, restrict: bool = False, ): @@ -1090,7 +1110,7 @@ def buildDB(self, if table_manipulation.lower() == 'replace': manifest_table_id = tableOps.replaceTable(specifySchema = True, columnTypeDict=col_schema,) elif table_manipulation.lower() == 'upsert': - manifest_table_id = tableOps.upsertTable(sg=sg,) + manifest_table_id = tableOps.upsertTable(dmge=dmge,) elif table_manipulation.lower() == 'update': manifest_table_id = tableOps.updateTable() @@ -1134,7 +1154,7 @@ def upload_manifest_file(self, manifest, metadataManifestPath, datasetId, restri return manifest_synapse_file_id @missing_entity_handler - def format_row_annotations(self, se, sg, row, entityId, hideBlanks): + def format_row_annotations(self, dmge, row, entityId, hideBlanks): # prepare metadata for Synapse storage (resolve display name into a name that Synapse annotations support (e.g no spaces, parenthesis) # note: the removal of special characters, will apply only to annotation keys; we are not altering the manifest # this could create a divergence between manifest column and annotations. this should be ok for most use cases. 
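With `formatDB` taking a `dmge`, column labels now come from the module-level `get_class_label_from_display_name` helper (added to `schematic/utils/schema_utils.py` later in this diff) before Synapse-unsafe characters are stripped. A hedged sketch of that sanitization step; `blacklist_chars` is defined elsewhere in `synapse.py` and not shown in this hunk, so the set below is only illustrative:

from schematic.utils.schema_utils import get_class_label_from_display_name

blacklist_chars = ["(", ")", ".", " ", "-"]  # assumption: stand-in for the real list in synapse.py

def to_synapse_column_label(display_name: str) -> str:
    # Convert the display name to a class label, then drop disallowed characters
    label = get_class_label_from_display_name(str(display_name))
    return label.translate({ord(x): "" for x in blacklist_chars})

print(to_synapse_column_label("Family History"))  # -> FamilyHistory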
@@ -1144,7 +1164,7 @@ def format_row_annotations(self, se, sg, row, entityId, hideBlanks): for k, v in row.to_dict().items(): - keySyn = se.get_class_label_from_display_name(str(k)).translate({ord(x): '' for x in blacklist_chars}) + keySyn = get_class_label_from_display_name(str(k)).translate({ord(x): '' for x in blacklist_chars}) # Skip `Filename` and `ETag` columns when setting annotations if keySyn in ["Filename", "ETag", "eTag"]: @@ -1172,7 +1192,7 @@ def format_row_annotations(self, se, sg, row, entityId, hideBlanks): else: if isinstance(anno_v,float) and np.isnan(anno_v): annos[anno_k] = "" - elif isinstance(anno_v,str) and re.fullmatch(csv_list_regex, anno_v) and rule_in_rule_list('list', sg.get_node_validation_rules(anno_k)): + elif isinstance(anno_v,str) and re.fullmatch(csv_list_regex, anno_v) and rule_in_rule_list('list', dmge.get_node_validation_rules(anno_k)): annos[anno_k] = anno_v.split(",") else: annos[anno_k] = anno_v @@ -1250,8 +1270,8 @@ def annotate_upload_manifest_table(self, manifest, datasetId, metadataManifestPa else: manifest["entityId"].fillna("", inplace=True) - # get a schema explorer object to ensure schema attribute names used in manifest are translated to schema labels for synapse annotations - se = SchemaExplorer() + # get a DataModelGraphExplorer object to ensure schema attribute names used in manifest are translated to schema labels for synapse annotations + dmge = DataModelGraphExplorer() # Create table name here. if 'Component' in manifest.columns: @@ -1261,7 +1281,7 @@ def annotate_upload_manifest_table(self, manifest, datasetId, metadataManifestPa # Upload manifest as a table and get the SynID and manifest manifest_synapse_table_id, manifest, table_manifest = self.upload_format_manifest_table( - se, manifest, datasetId, table_name, restrict = restrict_manifest, useSchemaLabel=useSchemaLabel,) + dmge, manifest, datasetId, table_name, restrict = restrict_manifest, useSchemaLabel=useSchemaLabel,) # Iterate over manifest rows, create Synapse entities and store corresponding entity IDs in manifest if needed # also set metadata for each synapse entity as Synapse annotations @@ -1321,7 +1341,7 @@ def _read_manifest(self, metadataManifestPath:str) -> pd.DataFrame: ) from err return manifest - def _add_id_columns_to_manifest(self, manifest: pd.DataFrame, sg: SchemaGenerator): + def _add_id_columns_to_manifest(self, manifest: pd.DataFrame, dmge: DataModelGraphExplorer): """Helper function to add id and entityId columns to the manifest if they do not already exist, Fill id values per row. Args: Manifest loaded as a pd.Dataframe @@ -1333,7 +1353,7 @@ def _add_id_columns_to_manifest(self, manifest: pd.DataFrame, sg: SchemaGenerato if not col_in_dataframe("Id", manifest): # See if schema has `Uuid` column specified try: - uuid_col_in_schema = sg.se.is_class_in_schema('Uuid') or sg.se.is_class_in_schema('uuid') + uuid_col_in_schema = dmge.is_class_in_schema('Uuid') or dmge.is_class_in_schema('uuid') except (KeyError): uuid_col_in_schema = False @@ -1376,11 +1396,10 @@ def _generate_table_name(self, manifest): table_name = 'synapse_storage_manifest_table' return table_name, component_name - def _add_annotations(self, se, schemaGenerator, row, entityId, hideBlanks): + def _add_annotations(self, dmge, row, entityId, hideBlanks): """Helper function to format and add annotations to entities in Synapse. Args: - se: schemaExplorer object, - schemaGenerator: schemaGenerator Object. 
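In `format_row_annotations`, a node's validation rules now come from `dmge.get_node_validation_rules`, and a comma-separated string is stored as a list annotation only when the node carries the `list` rule. An isolated sketch of that branch; the simplified regex below stands in for `comma_separated_list_regex()`, whose exact pattern is not shown in this hunk:

import re
from schematic.utils.validate_utils import rule_in_rule_list

# Assumption: placeholder pattern standing in for comma_separated_list_regex()
csv_list_regex = r"([^,]+,)+[^,]+"

def format_annotation_value(value: str, validation_rules: list):
    # Split into a list only when the value looks comma-separated AND the node has the 'list' rule
    if re.fullmatch(csv_list_regex, value) and rule_in_rule_list("list", validation_rules):
        return value.split(",")
    return value

print(format_annotation_value("ab,cd,ef", ["list"]))  # -> ['ab', 'cd', 'ef']
print(format_annotation_value("ab,cd,ef", ["int"]))   # -> 'ab,cd,ef'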
+ dmge: DataModelGraphExplorer object, row: current row of manifest being processed entityId (str): synapseId of entity to add annotations to hideBlanks: Boolean flag that does not upload annotation keys with blank values when true. Uploads Annotation keys with empty string values when false. @@ -1388,7 +1407,7 @@ def _add_annotations(self, se, schemaGenerator, row, entityId, hideBlanks): Annotations are added to entities in Synapse, no return. """ # Format annotations for Synapse - annos = self.format_row_annotations(se, schemaGenerator, row, entityId, hideBlanks) + annos = self.format_row_annotations(dmge, row, entityId, hideBlanks) if annos: # Store annotations for an entity folder @@ -1416,8 +1435,7 @@ def _create_entity_id(self, idx, row, manifest, datasetId): def add_annotations_to_entities_files( self, - se, - schemaGenerator, + dmge, manifest, manifest_record_type, datasetId, @@ -1426,8 +1444,7 @@ def add_annotations_to_entities_files( ): '''Depending on upload type add Ids to entityId row. Add anotations to connected files. Args: - se: Schema Explorer Object - schemaGenerator: SchemaGenerator object + dmge: DataModelGraphExplorer Object manifest (pd.DataFrame): loaded df containing user supplied data. manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both. datasetId (str): synapse ID of folder containing the dataset @@ -1464,14 +1481,13 @@ def add_annotations_to_entities_files( # Adding annotations to connected files. if entityId: - self._add_annotations(se, schemaGenerator, row, entityId, hideBlanks) + self._add_annotations(dmge, row, entityId, hideBlanks) logger.info(f"Added annotations to entity: {entityId}") return manifest def upload_manifest_as_table( self, - se, - schemaGenerator, + dmge, manifest, metadataManifestPath, datasetId, @@ -1485,8 +1501,7 @@ def upload_manifest_as_table( ): """Upload manifest to Synapse as a table and csv. Args: - se: SchemaExplorer object - schemaGenerator: SchemaGenerator Object + dmge: DataModelGraphExplorer object manifest (pd.DataFrame): loaded df containing user supplied data. metadataManifestPath: path to csv containing a validated metadata manifest. datasetId (str): synapse ID of folder containing the dataset @@ -1501,7 +1516,7 @@ def upload_manifest_as_table( """ # Upload manifest as a table, get the ID and updated manifest. manifest_synapse_table_id, manifest, table_manifest = self.uploadDB( - sg=schemaGenerator, + dmge=dmge, manifest=manifest, datasetId=datasetId, table_name=table_name, @@ -1509,7 +1524,7 @@ def upload_manifest_as_table( useSchemaLabel=useSchemaLabel, table_manipulation=table_manipulation) - manifest = self.add_annotations_to_entities_files(se, schemaGenerator, manifest, manifest_record_type, datasetId, hideBlanks, manifest_synapse_table_id) + manifest = self.add_annotations_to_entities_files(dmge, manifest, manifest_record_type, datasetId, hideBlanks, manifest_synapse_table_id) # Load manifest to synapse as a CSV File manifest_synapse_file_id = self.upload_manifest_file(manifest, metadataManifestPath, datasetId, restrict, component_name = component_name) @@ -1520,10 +1535,10 @@ def upload_manifest_as_table( # Update manifest Synapse table with new entity id column. 
manifest_synapse_table_id, manifest, table_manifest = self.uploadDB( - sg=schemaGenerator, - manifest=manifest, - datasetId=datasetId, - table_name=table_name, + dmge=dmge, + manifest=manifest, + datasetId=datasetId, + table_name=table_name, restrict=restrict, useSchemaLabel=useSchemaLabel, table_manipulation='update') @@ -1535,8 +1550,7 @@ def upload_manifest_as_table( def upload_manifest_as_csv( self, - se, - schemaGenerator, + dmge, manifest, metadataManifestPath, datasetId, @@ -1546,8 +1560,7 @@ def upload_manifest_as_csv( component_name): """Upload manifest to Synapse as a csv only. Args: - se: SchemaExplorer object - schemaGenerator: SchemaGenerator Object + dmge: DataModelGraphExplorer object manifest (pd.DataFrame): loaded df containing user supplied data. metadataManifestPath: path to csv containing a validated metadata manifest. datasetId (str): synapse ID of folder containing the dataset @@ -1559,8 +1572,8 @@ def upload_manifest_as_csv( Return: manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse. """ - # remove with_entities parameter and rename add_annotations, as add_annototaions_to_files_entities. - manifest = self.add_annotations_to_entities_files(se, schemaGenerator, manifest, manifest_record_type, datasetId, hideBlanks) + + manifest = self.add_annotations_to_entities_files(dmge, manifest, manifest_record_type, datasetId, hideBlanks) # Load manifest to synapse as a CSV File manifest_synapse_file_id = self.upload_manifest_file(manifest, @@ -1576,8 +1589,7 @@ def upload_manifest_as_csv( def upload_manifest_combo( self, - se, - schemaGenerator, + dmge, manifest, metadataManifestPath, datasetId, @@ -1591,8 +1603,7 @@ def upload_manifest_combo( ): """Upload manifest to Synapse as a table and CSV with entities. Args: - se: SchemaExplorer object - schemaGenerator: SchemaGenerator Object + dmge: DataModelGraphExplorer object manifest (pd.DataFrame): loaded df containing user supplied data. metadataManifestPath: path to csv containing a validated metadata manifest. datasetId (str): synapse ID of folder containing the dataset @@ -1607,7 +1618,7 @@ def upload_manifest_combo( manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse. """ manifest_synapse_table_id, manifest, table_manifest = self.uploadDB( - sg=schemaGenerator, + dmge=dmge, manifest=manifest, datasetId=datasetId, table_name=table_name, @@ -1615,7 +1626,7 @@ def upload_manifest_combo( useSchemaLabel=useSchemaLabel, table_manipulation=table_manipulation) - manifest = self.add_annotations_to_entities_files(se, schemaGenerator, manifest, manifest_record_type, datasetId, hideBlanks, manifest_synapse_table_id) + manifest = self.add_annotations_to_entities_files(dmge, manifest, manifest_record_type, datasetId, hideBlanks, manifest_synapse_table_id) # Load manifest to synapse as a CSV File manifest_synapse_file_id = self.upload_manifest_file(manifest, metadataManifestPath, datasetId, restrict, component_name) @@ -1627,7 +1638,7 @@ def upload_manifest_combo( # Update manifest Synapse table with new entity id column. 
manifest_synapse_table_id, manifest, table_manifest = self.uploadDB( - sg=schemaGenerator, + dmge=dmge, manifest=manifest, datasetId=datasetId, table_name=table_name, @@ -1641,7 +1652,7 @@ def upload_manifest_combo( return manifest_synapse_file_id def associateMetadataWithFiles( - self, schemaGenerator: SchemaGenerator, metadataManifestPath: str, datasetId: str, manifest_record_type: str = 'table_file_and_entities', + self, dmge: DataModelGraphExplorer, metadataManifestPath: str, datasetId: str, manifest_record_type: str = 'table_file_and_entities', useSchemaLabel: bool = True, hideBlanks: bool = False, restrict_manifest = False, table_manipulation: str = 'replace', ) -> str: """Associate metadata with files in a storage dataset already on Synapse. @@ -1656,7 +1667,7 @@ def associateMetadataWithFiles( for downstream query and interaction with the data. Args: - schemaGenerator: SchemaGenerator Object + dmge: DataModelGraphExplorer Object metadataManifestPath: path to csv containing a validated metadata manifest. The manifest should include a column entityId containing synapse IDs of files/entities to be associated with metadata, if that is applicable to the dataset type. Some datasets, e.g. clinical data, do not contain file id's, but data is stored in a table: one row per item. @@ -1672,10 +1683,7 @@ def associateMetadataWithFiles( """ # Read new manifest CSV: manifest = self._read_manifest(metadataManifestPath) - manifest = self._add_id_columns_to_manifest(manifest, schemaGenerator) - - # get a schema explorer object to ensure schema attribute names used in manifest are translated to schema labels for synapse annotations - se = SchemaExplorer() + manifest = self._add_id_columns_to_manifest(manifest, dmge) table_name, component_name = self._generate_table_name(manifest) @@ -1683,8 +1691,7 @@ def associateMetadataWithFiles( if manifest_record_type == "file_only": manifest_synapse_file_id = self.upload_manifest_as_csv( - se, - schemaGenerator, + dmge, manifest, metadataManifestPath, datasetId=datasetId, @@ -1695,8 +1702,7 @@ def associateMetadataWithFiles( ) elif manifest_record_type == "table_and_file": manifest_synapse_file_id = self.upload_manifest_as_table( - se, - schemaGenerator, + dmge, manifest, metadataManifestPath, datasetId=datasetId, @@ -1710,8 +1716,7 @@ def associateMetadataWithFiles( ) elif manifest_record_type == "file_and_entities": manifest_synapse_file_id = self.upload_manifest_as_csv( - se, - schemaGenerator, + dmge, manifest, metadataManifestPath, datasetId=datasetId, @@ -1722,8 +1727,7 @@ def associateMetadataWithFiles( ) elif manifest_record_type == "table_file_and_entities": manifest_synapse_file_id = self.upload_manifest_combo( - se, - schemaGenerator, + dmge, manifest, metadataManifestPath, datasetId=datasetId, @@ -2208,7 +2212,7 @@ def _get_auth_token(self,): return authtoken - def upsertTable(self, sg: SchemaGenerator,): + def upsertTable(self, dmge: DataModelGraphExplorer): """ Method to upsert rows from a new manifest into an existing table on synapse For upsert functionality to work, primary keys must follow the naming convention of _id @@ -2217,7 +2221,7 @@ def upsertTable(self, sg: SchemaGenerator,): Args: - sg: SchemaGenerator instance + dmge: DataModelGraphExplorer instance Returns: existingTableId: synID of the already existing table that had its metadata replaced @@ -2233,7 +2237,7 @@ def upsertTable(self, sg: SchemaGenerator,): except(SynapseHTTPError) as ex: # If error is raised because Table has old `Uuid` column and not new `Id` column, then handle 
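`associateMetadataWithFiles` is the main public entry point affected by this rename: callers now hand it a `DataModelGraphExplorer` rather than a `SchemaGenerator`, and the method no longer builds its own `SchemaExplorer`. A hedged sketch of a call under the new signature; the Synapse ID and manifest path are illustrative, and a configured Synapse login is assumed:

from schematic.schemas.data_model_parser import DataModelParser
from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer
from schematic.store.synapse import SynapseStorage

parsed = DataModelParser(path_to_data_model="tests/data/example.model.jsonld").parse_model()
dmge = DataModelGraphExplorer(DataModelGraph(parsed).generate_data_model_graph())

store = SynapseStorage()  # assumes Synapse credentials are already configured
manifest_synapse_file_id = store.associateMetadataWithFiles(
    dmge=dmge,
    metadataManifestPath="data/manifest.csv",   # illustrative path to a validated manifest CSV
    datasetId="syn12345678",                    # illustrative Synapse dataset ID
    manifest_record_type="table_and_file",
)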
and re-attempt upload if 'Id is not a valid column name or id' in str(ex): - self._update_table_uuid_column(sg) + self._update_table_uuid_column(dmge) synapseDB.upsert_table_rows(table_name=self.tableName, data=self.tableToLoad) # Raise if other error else: @@ -2241,12 +2245,12 @@ def upsertTable(self, sg: SchemaGenerator,): return self.existingTableId - def _update_table_uuid_column(self, sg: SchemaGenerator,) -> None: + def _update_table_uuid_column(self, dmge: DataModelGraphExplorer,) -> None: """Removes the `Uuid` column when present, and relpaces with an `Id` column Used to enable backwards compatability for manifests using the old `Uuid` convention Args: - sg: SchemaGenerator instance + dmge: DataModelGraphExplorer instance Returns: None @@ -2261,7 +2265,7 @@ def _update_table_uuid_column(self, sg: SchemaGenerator,) -> None: if col.name.lower() == 'uuid': # See if schema has `Uuid` column specified try: - uuid_col_in_schema = sg.se.is_class_in_schema(col.name) + uuid_col_in_schema = dmge.is_class_in_schema(col.name) except (KeyError): uuid_col_in_schema = False diff --git a/schematic/utils/schema_utils.py b/schematic/utils/schema_utils.py index 553ac4fb4..b8cab8e66 100644 --- a/schematic/utils/schema_utils.py +++ b/schematic/utils/schema_utils.py @@ -1,265 +1,126 @@ -import networkx as nx +import inflection import json - -from schematic.utils.curie_utils import extract_name_from_uri_or_curie -from schematic.utils.validate_utils import validate_class_schema -from schematic.utils.validate_rules_utils import validate_schema_rules - - -def load_schema_into_networkx(schema): - G = nx.MultiDiGraph() - for record in schema["@graph"]: - - # TODO: clean up obsolete code - # if record["@type"] == "rdfs:Class": - - # creation of nodes - # adding nodes to the graph - node = {} - for (k, value) in record.items(): - # Some keys in the current schema.org schema have a dictionary entry for their value that includes keys @language and @value, - # for parity with other schemas, we just want the value - if isinstance(value,dict) and "@language" in value.keys(): - record[k] = record[k]["@value"] - if ":" in k: - key = k.split(":")[1] - node[key] = value - elif "@" in k: - key = k[1:] - node[key] = value - else: - node[k] = value - - # creation of edges - # adding edges to the graph - if "rdfs:subClassOf" in record: - parents = record["rdfs:subClassOf"] - if type(parents) == list: - for _parent in parents: - n1 = extract_name_from_uri_or_curie(_parent["@id"]) - n2 = record["rdfs:label"] - - # do not allow self-loops - if n1 != n2: - G.add_edge(n1, n2, key="parentOf") - elif type(parents) == dict: - n1 = extract_name_from_uri_or_curie(parents["@id"]) - n2 = record["rdfs:label"] - - # do not allow self-loops - if n1 != n2: - G.add_edge(n1, n2, key="parentOf") - - # TODO: refactor: abstract adding relationship method - if "sms:requiresDependency" in record: - dependencies = record["sms:requiresDependency"] - if type(dependencies) == list: - for _dep in dependencies: - n1 = record["rdfs:label"] - n2 = extract_name_from_uri_or_curie(_dep["@id"]) - # do not allow self-loops - if n1 != n2: - G.add_edge(n1, n2, key="requiresDependency") - - if "sms:requiresComponent" in record: - components = record["sms:requiresComponent"] - if type(components) == list: - for _comp in components: - n1 = record["rdfs:label"] - n2 = extract_name_from_uri_or_curie(_comp["@id"]) - # do not allow self-loops - if n1 != n2: - G.add_edge(n1, n2, key="requiresComponent") - - if "schema:rangeIncludes" in record: - range_nodes = 
record["schema:rangeIncludes"] - if type(range_nodes) == list: - for _range_node in range_nodes: - n1 = record["rdfs:label"] - n2 = extract_name_from_uri_or_curie(_range_node["@id"]) - # do not allow self-loops - if n1 != n2: - G.add_edge(n1, n2, key="rangeValue") - elif type(range_nodes) == dict: - n1 = record["rdfs:label"] - n2 = extract_name_from_uri_or_curie(range_nodes["@id"]) - # do not allow self-loops - if n1 != n2: - G.add_edge(n1, n2, key="rangeValue") - - if "schema:domainIncludes" in record: - domain_nodes = record["schema:domainIncludes"] - if type(domain_nodes) == list: - for _domain_node in domain_nodes: - n1 = extract_name_from_uri_or_curie(_domain_node["@id"]) - n2 = record["rdfs:label"] - # do not allow self-loops - if n1 != n2: - G.add_edge(n1, n2, key="domainValue") - elif type(domain_nodes) == dict: - n1 = extract_name_from_uri_or_curie(domain_nodes["@id"]) - n2 = record["rdfs:label"] - # do not allow self-loops - if n1 != n2: - G.add_edge(n1, n2, key="domainValue") - - # check schema generator (JSON validation schema gen) - if ( - "requiresChildAsValue" in node - and node["requiresChildAsValue"]["@id"] == "sms:True" - ): - node["requiresChildAsValue"] = True - - if "required" in node: - if "sms:true" == record["sms:required"]: - node["required"] = True - else: - node["required"] = False - - # not sure if this is required? - if "sms:validationRules" in record: - node["validationRules"] = record["sms:validationRules"] - if node["validationRules"]: - validate_vr = validate_schema_rules( - record["sms:validationRules"], - record["rdfs:label"], - input_filetype = 'json_schema') - else: - node["validationRules"] = [] - - node["uri"] = record["@id"] - node["description"] = record["rdfs:comment"] - G.add_node(record["rdfs:label"], **node) - # print(node) - # print(G.nodes()) - - return G - - -def node_attrs_cleanup(class_add_mod: dict) -> dict: - # clean map that will be inputted into the node/graph - node = {} - for (k, value) in class_add_mod.items(): - if ":" in k: - key = k.split(":")[1] - node[key] = value - elif "@" in k: - key = k[1:] - node[key] = value +import networkx as nx +import string +from typing import List, Dict + +def attr_dict_template(key_name:str)->Dict[str,dict[str,dict]]: + return {key_name: {'Relationships': {}}} + +def get_property_label_from_display_name(display_name:str, strict_camel_case:bool = False) -> str: + """Convert a given display name string into a proper property label string + Args: + display_name, str: node display name + strict_camel_case, bool: Default, False; defines whether or not to use strict camel case or not for conversion. + Returns: + label, str: property label of display name + """ + # This is the newer more strict method + if strict_camel_case: + display_name = display_name.strip().translate({ord(c): "_" for c in string.whitespace}) + label = inflection.camelize(display_name, uppercase_first_letter=False) + + # This method remains for backwards compatibility else: - node[k] = value - - return node - - -def relationship_edges( - schema_graph_nx: nx.MultiDiGraph, class_add_mod: dict, **kwargs -) -> nx.MultiDiGraph: - """ - Notes: - ===== - # pass the below dictionary as the third argument (kwargs) to relationship_edges(). - # "in" indicates that the relationship has an in-edges behaviour. - # "out" indicates that the relationship has an out-edges behaviour. 
- - rel_dict = { - "rdfs:subClassOf": { - "parentOf": "in" - }, - "schema:domainIncludes": { - "domainValue": "in" - }, - "sms:requiresDependency": { - "requiresDependency": "out" - }, - "sms:requiresComponent": { - "requiresComponent": "out" - }, - "schema:rangeIncludes": { - "rangeValue": "out" - } - } + display_name = display_name.translate({ord(c): None for c in string.whitespace}) + label = inflection.camelize(display_name.strip(), uppercase_first_letter=False) + + return label + +def get_class_label_from_display_name(display_name:str, strict_camel_case:bool = False) -> str: + """Convert a given display name string into a proper class label string + Args: + display_name, str: node display name + strict_camel_case, bool: Default, False; defines whether or not to use strict camel case or not for conversion. + Returns: + label, str: class label of display name """ - for rel, rel_lab_node_type in kwargs.items(): - for rel_label, node_type in rel_lab_node_type.items(): - if rel in class_add_mod: - parents = class_add_mod[rel] - if type(parents) == list: - for _parent in parents: - - if node_type == "in": - n1 = extract_name_from_uri_or_curie(_parent["@id"]) - n2 = class_add_mod["rdfs:label"] - - if node_type == "out": - n1 = class_add_mod["rdfs:label"] - n2 = extract_name_from_uri_or_curie(_parent["@id"]) - - # do not allow self-loops - if n1 != n2: - schema_graph_nx.add_edge(n1, n2, key=rel_label) - elif type(parents) == dict: - if node_type == "in": - n1 = extract_name_from_uri_or_curie(parents["@id"]) - n2 = class_add_mod["rdfs:label"] - - if node_type == "out": - n1 = class_add_mod["rdfs:label"] - n2 = extract_name_from_uri_or_curie(parents["@id"]) - - # do not allow self-loops - if n1 != n2: - schema_graph_nx.add_edge(n1, n2, key=rel_label) - - return schema_graph_nx - - -def class_to_node(class_to_convert: dict) -> nx.Graph: - G = nx.Graph() - - node = {} # node to be added the above graph and returned - for (k, v) in class_to_convert.items(): - if ":" in k: # if ":" is present in key - key = k.split(":")[1] - node[key] = v - elif "@" in k: # if "@" is present in key - key = k[1:] - node[key] = v - else: - node[k] = v - - if "required" in node: - if class_to_convert["sms:required"] == "sms:true": - node["required"] = True - else: - node["required"] = False + # This is the newer more strict method + if strict_camel_case: + display_name = display_name.strip().translate({ord(c): "_" for c in string.whitespace}) + label = inflection.camelize(display_name, uppercase_first_letter=True) - if "sms:validationRules" in class_to_convert: - node["validationRules"] = class_to_convert["sms:validationRules"] + # This method remains for backwards compatibility else: - node["validationRules"] = [] - - node["uri"] = class_to_convert["@id"] # add separate "uri" key - node["description"] = class_to_convert[ - "rdfs:comment" - ] # separately store "comment" as "description" - G.add_node(class_to_convert["rdfs:label"], **node) - - return G - - -def replace_node_in_schema(schema: nx.MultiDiGraph, class_add_mod: dict) -> None: - # part of the code that replaces the modified class in the original JSON-LD schema (not in the data/ folder though) - for i, schema_class in enumerate(schema["@graph"]): - if schema_class["rdfs:label"] == class_add_mod["rdfs:label"]: - validate_class_schema( - class_add_mod - ) # validate that the class to be modified follows the structure for any generic class (node) - - schema["@graph"][i] = class_add_mod - break - + display_name = display_name.translate({ord(c): None for c in 
string.whitespace}) + label = inflection.camelize(display_name.strip(), uppercase_first_letter=True) + + return label + +def get_attribute_display_name_from_label(node_name: str, attr_relationships: dict) -> str: + '''Get attribute display name for a node, using the node label, requires the attr_relationships dicitonary from the data model parser + Args: + node_name, str: node label + attr_relationships, dict: dictionary defining attributes and relationships, generated in data model parser. + Returns: + display_name, str: node display name, recorded in attr_relationships. + ''' + if 'Attribute' in attr_relationships.keys(): + display_name = attr_relationships['Attribute'] + else: + display_name = node_name + return display_name + +def get_label_from_display_name(display_name:str, entry_type:str, strict_camel_case:bool = False) -> str: + """Get node label from provided display name, based on whether the node is a class or property + Args: + display_name, str: node display name + entry_type, str: 'class' or 'property', defines what type the entry is. + strict_camel_case, bool: Default, False; defines whether or not to use strict camel case or not for conversion. + Returns: + label, str: class label of display name + Raises: + ValueError if entry_type.lower(), is not either 'class' or 'property' -def export_schema(schema, file_path): + """ + if entry_type.lower()=='class': + label = get_class_label_from_display_name(display_name=display_name, strict_camel_case=strict_camel_case) + + elif entry_type.lower()=='property': + label=get_property_label_from_display_name(display_name=display_name, strict_camel_case=strict_camel_case) + else: + raise ValueError(f"The entry type submitted: {entry_type}, is not one of the permitted types: 'class' or 'property'") + return label + +def convert_bool_to_str(provided_bool: bool) -> str: + """Convert bool to string. + Args: + provided_bool, str: true or false bool + Returns: + Boolean converted to 'true' or 'false' str as appropriate. + """ + return str(provided_bool) + +def parse_validation_rules(validation_rules:List[str]) -> List[str]: + """Split multiple validation rules based on :: delimiter + Args: + validation_rules, list: list containing a string validation rule + Returns: + validation_rules, list: if submitted List + """ + if validation_rules and '::' in validation_rules[0]: + validation_rules = validation_rules[0].split('::') + return validation_rules + +def export_schema(schema: dict, file_path: str) -> None: + """Export schema to given filepath. + Args: + schema, dict: JSONLD schema + filepath, str: path to store the schema + """ with open(file_path, "w") as f: json.dump(schema, f, sort_keys=True, indent=4, ensure_ascii=False) + +def strip_context(context_value: str) -> tuple[str]: + """Strip contexts from str entry. 
+ Args: + context_value, str: string from which to strip context from + Returns: + context, str: the original context + v, str: value separated from context + """ + if ':' in context_value: + context, v = context_value.split(':') + elif '@' in context_value: + context, v = context_value.split('@') + return context, v diff --git a/schematic/visualization/attributes_explorer.py b/schematic/visualization/attributes_explorer.py index 0b18ab092..0917172dd 100644 --- a/schematic/visualization/attributes_explorer.py +++ b/schematic/visualization/attributes_explorer.py @@ -6,7 +6,10 @@ import pandas as pd from typing import Any, Dict, Optional, Text, List -from schematic.schemas import SchemaGenerator +from schematic.schemas.data_model_parser import DataModelParser +from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer +from schematic.schemas.data_model_json_schema import DataModelJSONSchema + from schematic.utils.io_utils import load_json logger = logging.getLogger(__name__) @@ -17,12 +20,27 @@ def __init__(self, )-> None: self.path_to_jsonld = path_to_jsonld - self.json_data_model = load_json(self.path_to_jsonld) + self.jsonld = load_json(self.path_to_jsonld) - # instantiate a schema generator to retrieve db schema graph from metadata model graph - self.sg = SchemaGenerator(self.path_to_jsonld) + # Instantiate Data Model Parser + data_model_parser = DataModelParser(path_to_data_model = self.path_to_jsonld) + + #Parse Model + parsed_data_model = data_model_parser.parse_model() + + # Instantiate DataModelGraph + data_model_grapher = DataModelGraph(parsed_data_model) + # Generate graph + self.graph_data_model = data_model_grapher.generate_data_model_graph() + + # Instantiate Data Model Graph Explorer + self.dmge = DataModelGraphExplorer(self.graph_data_model) + + # Instantiate Data Model Json Schema + self.data_model_js = DataModelJSONSchema(jsonld_path=self.path_to_jsonld, graph=self.graph_data_model) + self.output_path = self.create_output_path('merged_csv') def create_output_path(self, terminal_folder): @@ -62,7 +80,7 @@ def parse_attributes(self, save_file=True): ''' # get all components - component_dg = self.sg.se.get_digraph_by_edge_type('requiresComponent') + component_dg = self.dmge.get_digraph_by_edge_type('requiresComponent') components = component_dg.nodes() # For each data type to be loaded gather all attribtes the user would @@ -115,9 +133,9 @@ def _parse_attributes(self, components, save_file=True, include_index=True): df_store = [] for component in components: data_dict = {} + # get the json schema - json_schema = self.sg.get_json_schema_requirements( - source_node=component, schema_name=self.path_to_jsonld) + json_schema = self.data_model_js.get_json_validation_schema(source_node=component, schema_name=self.path_to_jsonld) # Gather all attribues, their valid values and requirements for key, value in json_schema['properties'].items(): diff --git a/schematic/visualization/tangled_tree.py b/schematic/visualization/tangled_tree.py index d5b838e85..83635a39c 100644 --- a/schematic/visualization/tangled_tree.py +++ b/schematic/visualization/tangled_tree.py @@ -12,8 +12,11 @@ from schematic.utils.viz_utils import visualize from schematic.visualization.attributes_explorer import AttributesExplorer -from schematic.schemas.explorer import SchemaExplorer -from schematic.schemas.generator import SchemaGenerator + +from schematic.schemas.data_model_parser import DataModelParser +from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer 
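The rewritten `schema_utils` module trades the networkx-loading code for small, stateless helpers. A few doctest-style calls, based only on the implementations added above (inputs are illustrative; `int::IsNA` mirrors a rule from tests/data/example.model.csv):

from schematic.utils.schema_utils import (
    get_class_label_from_display_name,
    get_property_label_from_display_name,
    parse_validation_rules,
    strip_context,
)

print(get_class_label_from_display_name("patient ID"))     # -> PatientID
print(get_property_label_from_display_name("patient ID"))  # -> patientID

# '::' packs several validation rules into one model cell
print(parse_validation_rules(["int::IsNA"]))               # -> ['int', 'IsNA']

# Context prefixes such as 'bts:' are split off from the value
print(strip_context("bts:Patient"))                        # -> ('bts', 'Patient')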
+from schematic.schemas.data_model_relationships import DataModelRelationships + from schematic import LOADER from schematic.utils.io_utils import load_json from copy import deepcopy @@ -40,11 +43,20 @@ def __init__(self, # Parse schema name self.schema_name = path.basename(self.path_to_json_ld).split(".model.jsonld")[0] - # Instantiate a schema generator to retrieve db schema graph from metadata model graph - self.sg = SchemaGenerator(self.path_to_json_ld) + # Instantiate Data Model Parser + data_model_parser = DataModelParser(path_to_data_model = self.path_to_json_ld) + + #Parse Model + parsed_data_model = data_model_parser.parse_model() + + # Instantiate DataModelGraph + data_model_grapher = DataModelGraph(parsed_data_model) + + # Generate graph + self.graph_data_model = data_model_grapher.generate_data_model_graph() - # Get metadata model schema graph - self.G = self.sg.se.get_nx_schema() + # Instantiate Data Model Graph Explorer + self.dmge = DataModelGraphExplorer(self.graph_data_model) # Set Parameters self.figure_type = figure_type.lower() @@ -80,14 +92,14 @@ def get_text_for_tangled_tree(self, text_type, save_file=False): save_file==False: Returns plain or highlighted text as a csv string. ''' # Get nodes in the digraph, many more nodes returned if figure type is dependency - cdg = self.sg.se.get_digraph_by_edge_type(self.dependency_type) + cdg = self.dmge.get_digraph_by_edge_type(self.dependency_type) nodes = cdg.nodes() if self.dependency_type == 'requiresComponent': component_nodes = nodes else: # get component nodes if making dependency figure - component_dg = self.sg.se.get_digraph_by_edge_type('requiresComponent') + component_dg = self.dmge.get_digraph_by_edge_type('requiresComponent') component_nodes = component_dg.nodes() # Initialize lists @@ -98,7 +110,7 @@ def get_text_for_tangled_tree(self, text_type, save_file=False): for node in component_nodes: # Get the highlighted components based on figure_type if self.figure_type == 'component': - highlight_descendants = self.sg.se.get_descendants_by_edge_type(node, 'requiresComponent') + highlight_descendants = self.dmge.get_descendants_by_edge_type(node, 'requiresComponent') elif self.figure_type == 'dependency': highlight_descendants = [node] @@ -139,12 +151,13 @@ def get_topological_generations(self): edges: (Networkx EdgeDataView) Edges of component or dependency graph. When iterated over it works like a list of tuples. ''' # Get nodes in the digraph - digraph = self.sg.se.get_digraph_by_edge_type(self.dependency_type) + digraph = self.dmge.get_digraph_by_edge_type(self.dependency_type) nodes = digraph.nodes() # Get subgraph - mm_graph = self.sg.se.get_nx_schema() - subg = self.sg.get_subgraph_by_edge_type(mm_graph, self.dependency_type) + #mm_graph = self.sg.se.get_nx_schema() + #subg = self.sg.get_subgraph_by_edge_type(mm_graph, self.dependency_type) + subg = self.dmge.get_subgraph_by_edge_type(self.dependency_type) # Get edges and topological_gen based on figure type. 
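`AttributesExplorer` now delegates JSON validation schema generation to `DataModelJSONSchema` instead of `SchemaGenerator.get_json_schema_requirements`. A hedged sketch of that call outside the class; the model path is illustrative and "Patient" is assumed to be a component in that model:

from schematic.schemas.data_model_parser import DataModelParser
from schematic.schemas.data_model_graph import DataModelGraph
from schematic.schemas.data_model_json_schema import DataModelJSONSchema

path_to_jsonld = "tests/data/example.model.jsonld"  # illustrative
parsed = DataModelParser(path_to_data_model=path_to_jsonld).parse_model()
graph_data_model = DataModelGraph(parsed).generate_data_model_graph()

data_model_js = DataModelJSONSchema(jsonld_path=path_to_jsonld, graph=graph_data_model)
json_schema = data_model_js.get_json_validation_schema(
    source_node="Patient",   # assumed component name
    schema_name=path_to_jsonld,
)
print(sorted(json_schema["properties"])[:5])  # attribute names with their valid values/requirements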
if self.figure_type == 'component': @@ -217,7 +230,7 @@ def gather_component_dependency_info(self, cn, attributes_df): ''' # Gather all component dependency information - component_attributes = self.sg.get_descendants_by_edge_type( + component_attributes = self.dmge.get_descendants_by_edge_type( cn, self.dependency_type, connected=True @@ -727,7 +740,7 @@ def get_ancestors_nodes(self, subgraph, components): """ all_parent_children = {} for component in components: - all_ancestors = self.sg.se.get_nodes_ancestors(subgraph, component) + all_ancestors = self.dmge.get_nodes_ancestors(subgraph=subgraph, node_label=component) all_parent_children[component] = all_ancestors return all_parent_children @@ -768,7 +781,7 @@ def get_tangled_tree_layers(self, save_file=True): if self.figure_type == 'dependency': # Get component digraph and nodes. - component_dg = self.sg.se.get_digraph_by_edge_type('requiresComponent') + component_dg = self.dmge.get_digraph_by_edge_type('requiresComponent') component_nodes = component_dg.nodes() # Get table of attributes. diff --git a/schematic_api/api/openapi/api.yaml b/schematic_api/api/openapi/api.yaml index ba04b659d..7e4912e2c 100644 --- a/schematic_api/api/openapi/api.yaml +++ b/schematic_api/api/openapi/api.yaml @@ -799,7 +799,7 @@ paths: tags: - Schema Operation - /explorer/find_class_specific_properties: + /schemas/find_class_specific_properties: get: summary: Find properties specifically associated with a given class description: Find properties specifically associated with a given class @@ -961,7 +961,7 @@ paths: tags: - Schema Operation - /explorer/get_node_dependencies: + /schemas/get_node_dependencies: get: summary: Get the immediate dependencies that are related to a given source node description: Get the immediate dependencies that are related to a given source node @@ -1007,20 +1007,12 @@ paths: tags: - Schema Operation - /explorer/get_property_label_from_display_name: + /utils/get_property_label_from_display_name: get: summary: Converts a given display name string into a proper property label string description: Converts a given display name string into a proper property label string - operationId: schematic_api.api.routes.get_property_label_from_display_name + operationId: schematic_api.api.routes.get_property_label_from_display_name_route parameters: - - in: query - name: schema_url - schema: - type: string - description: Data Model URL - example: >- - https://raw.githubusercontent.com/Sage-Bionetworks/schematic/develop/tests/data/example.model.jsonld - required: true - in: query name: display_name schema: @@ -1044,7 +1036,7 @@ paths: tags: - Schema Operation - /explorer/get_node_range: + /schemas/get_node_range: get: summary: Get all the valid values that are associated with a node label. description: Get all the valid values that are associated with a node label. 
diff --git a/schematic_api/api/routes.py b/schematic_api/api/routes.py index 4afd8897f..736a0c2db 100644 --- a/schematic_api/api/routes.py +++ b/schematic_api/api/routes.py @@ -5,6 +5,7 @@ import shutil import urllib.request import logging +import pathlib import pickle import connexion @@ -24,11 +25,15 @@ from schematic.visualization.tangled_tree import TangledTree from schematic.manifest.generator import ManifestGenerator from schematic.models.metadata import MetadataModel -from schematic.schemas.generator import SchemaGenerator -from schematic.schemas.explorer import SchemaExplorer + +from schematic.schemas.data_model_parser import DataModelParser +from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer +#from schematic.schemas.data_model_relationships import DataModelRelationships + from schematic.store.synapse import SynapseStorage, ManifestDownload from synapseclient.core.exceptions import SynapseHTTPError, SynapseAuthenticationError, SynapseUnmetAccessRestrictions, SynapseNoCredentialsError, SynapseTimeoutError from schematic.utils.general import entity_type_mapping +from schematic.utils.schema_utils import get_property_label_from_display_name logger = logging.getLogger(__name__) logging.basicConfig(level=logging.DEBUG) @@ -193,9 +198,12 @@ def save_file(file_key="csv_file"): return temp_path def initalize_metadata_model(schema_url): - jsonld = get_temp_jsonld(schema_url) + # get path to temp data model file (csv or jsonld) as appropriate + data_model = get_temp_model_path(schema_url) + + metadata_model = MetadataModel( - inputMModelLocation=jsonld, inputMModelLocationType="local" + inputMModelLocation=data_model, inputMModelLocationType="local" ) return metadata_model @@ -208,11 +216,32 @@ def get_temp_jsonld(schema_url): # get path to temporary JSON-LD file return tmp_file.name +def get_temp_csv(schema_url): + # retrieve a CSV via URL and store it in a temporary location + with urllib.request.urlopen(schema_url) as response: + with tempfile.NamedTemporaryFile(delete=False, suffix=".model.csv") as tmp_file: + shutil.copyfileobj(response, tmp_file) + + # get path to temporary csv file + return tmp_file.name + +def get_temp_model_path(schema_url): + # Get model type: + model_extension = pathlib.Path(schema_url).suffix.replace('.', '').upper() + if model_extension == 'CSV': + temp_path = get_temp_csv(schema_url) + elif model_extension == 'JSONLD': + temp_path = get_temp_jsonld(schema_url) + else: + raise ValueError("Did not provide a valid model type CSV or JSONLD, please check submission and try again.") + return temp_path + + # @before_request def get_manifest_route(schema_url: str, use_annotations: bool, dataset_ids=None, asset_view = None, output_format=None, title=None, strict_validation:bool=True): """Get the immediate dependencies that are related to a given source node. Args: - schema_url: link to data model in json ld format + schema_url: link to data model in json ld or csv format title: title of a given manifest. dataset_id: Synapse ID of the "dataset" entity on Synapse (for a given center/project). output_format: contains three option: "excel", "google_sheet", and "dataframe". 
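`get_temp_model_path` is the piece that lets every route accept either a CSV or a JSON-LD model URL: it inspects the file extension and raises a ValueError for anything else. A standalone sketch that mirrors that dispatch without performing the download (URLs are illustrative):

import pathlib

def model_type_from_url(schema_url: str) -> str:
    # Mirror of the suffix check in get_temp_model_path above
    model_extension = pathlib.Path(schema_url).suffix.replace(".", "").upper()
    if model_extension not in ("CSV", "JSONLD"):
        raise ValueError(
            "Did not provide a valid model type CSV or JSONLD, please check submission and try again."
        )
    return model_extension

print(model_type_from_url("https://example.org/example.model.jsonld"))  # -> JSONLD
print(model_type_from_url("https://example.org/example.model.csv"))     # -> CSV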
if set to "excel", return an excel spreadsheet @@ -228,9 +257,8 @@ def get_manifest_route(schema_url: str, use_annotations: bool, dataset_ids=None, # call config_handler() config_handler(asset_view = asset_view) - - # get path to temporary JSON-LD file - jsonld = get_temp_jsonld(schema_url) + + temp_path = get_temp_model_path(schema_url=schema_url) # Gather all data_types to make manifests for. all_args = connexion.request.args @@ -265,8 +293,9 @@ def get_manifest_route(schema_url: str, use_annotations: bool, dataset_ids=None, f"When submitting 'all manifests' as the data_type cannot also submit dataset_id. " f"Please check your submission and try again." ) - - all_results = ManifestGenerator.create_manifests(jsonld=jsonld, output_format=output_format, data_types=data_type, title=title, access_token=access_token, dataset_ids=dataset_ids, strict=strict_validation, use_annotations=use_annotations) + + all_results = ManifestGenerator.create_manifests(path_to_data_model=schema_url, output_format=output_format, data_types=data_type, title=title, access_token=access_token, dataset_ids=dataset_ids, strict=strict_validation, use_annotations=use_annotations) + return all_results #####profile validate manifest route function @@ -294,11 +323,11 @@ def validate_manifest_route(schema_url, data_type, restrict_rules=None, json_str else: temp_path = jsc.convert_json_file_to_csv("file_name") - # get path to temporary JSON-LD file - jsonld = get_temp_jsonld(schema_url) + # get path to temp data model file (csv or jsonld) as appropriate + data_model = get_temp_model_path(schema_url) metadata_model = MetadataModel( - inputMModelLocation=jsonld, inputMModelLocationType="local" + inputMModelLocation=data_model, inputMModelLocationType="local" ) errors, warnings = metadata_model.validateModelManifest( @@ -356,9 +385,12 @@ def submit_manifest_route(schema_url, validate_component = None else: validate_component = data_type + + # get path to temp data model file (csv or jsonld) as appropriate + data_model = get_temp_model_path(schema_url) manifest_id = metadata_model.submit_metadata_manifest( - path_to_json_ld = schema_url, + path_to_json_ld = data_model, manifest_path=temp_path, dataset_id=dataset_id, validate_component=validate_component, @@ -377,14 +409,14 @@ def populate_manifest_route(schema_url, title=None, data_type=None, return_excel # call config_handler() config_handler() - # get path to temporary JSON-LD file - jsonld = get_temp_jsonld(schema_url) - # Get path to temp file where manifest file contents will be saved temp_path = save_file() + + # get path to temp data model file (csv or jsonld) as appropriate + data_model = get_temp_model_path(schema_url) #Initalize MetadataModel - metadata_model = MetadataModel(inputMModelLocation=jsonld, inputMModelLocationType='local') + metadata_model = MetadataModel(inputMModelLocation=data_model, inputMModelLocationType='local') #Call populateModelManifest class populated_manifest_link = metadata_model.populateModelManifest(title=title, manifestPath=temp_path, rootNode=data_type, return_excel=return_excel) @@ -478,9 +510,10 @@ def get_viz_attributes_explorer(schema_url): # call config_handler() config_handler() - temp_path_to_jsonld = get_temp_jsonld(schema_url) + # get path to temp data model file (csv or jsonld) as appropriate + data_model = get_temp_model_path(schema_url) - attributes_csv = AttributesExplorer(temp_path_to_jsonld).parse_attributes(save_file=False) + attributes_csv = AttributesExplorer(data_model).parse_attributes(save_file=False) return 
attributes_csv @@ -488,19 +521,21 @@ def get_viz_component_attributes_explorer(schema_url, component, include_index): # call config_handler() config_handler() - temp_path_to_jsonld = get_temp_jsonld(schema_url) + # get path to temp data model file (csv or jsonld) as appropriate + data_model = get_temp_model_path(schema_url) - attributes_csv = AttributesExplorer(temp_path_to_jsonld).parse_component_attributes(component, save_file=False, include_index=include_index) + attributes_csv = AttributesExplorer(data_model).parse_component_attributes(component, save_file=False, include_index=include_index) return attributes_csv @cross_origin(["http://localhost", "https://sage-bionetworks.github.io"]) def get_viz_tangled_tree_text(schema_url, figure_type, text_format): - temp_path_to_jsonld = get_temp_jsonld(schema_url) + # get path to temp data model file (csv or jsonld) as appropriate + data_model = get_temp_model_path(schema_url) # Initialize TangledTree - tangled_tree = TangledTree(temp_path_to_jsonld, figure_type) + tangled_tree = TangledTree(data_model, figure_type) # Get text for tangled tree. text_df = tangled_tree.get_text_for_tangled_tree(text_format, save_file=False) @@ -513,10 +548,11 @@ def get_viz_tangled_tree_layers(schema_url, figure_type): # call config_handler() config_handler() - temp_path_to_jsonld = get_temp_jsonld(schema_url) + # get path to temp data model file (csv or jsonld) as appropriate + data_model = get_temp_model_path(schema_url) # Initialize Tangled Tree - tangled_tree = TangledTree(temp_path_to_jsonld, figure_type) + tangled_tree = TangledTree(data_model, figure_type) # Get tangled trees layers JSON. layers = tangled_tree.get_tangled_tree_layers(save_file=False) @@ -638,35 +674,41 @@ def get_manifest_datatype(manifest_id, asset_view): return manifest_dtypes_dict def get_schema_pickle(schema_url): - # load schema - se = SchemaExplorer() + data_model_parser = DataModelParser(path_to_data_model = schema_url) + #Parse Model + parsed_data_model = data_model_parser.parse_model() - se.load_schema(schema_url) + # Instantiate DataModelGraph + data_model_grapher = DataModelGraph(parsed_data_model) - # get schema - schema_graph = se.get_nx_schema() + # Generate graph + graph_data_model = data_model_grapher.generate_data_model_graph() # write to local pickle file path = os.getcwd() export_path = os.path.join(path, 'tests/data/schema.gpickle') with open(export_path, 'wb') as file: - pickle.dump(schema_graph, file) + pickle.dump(graph_data_model, file) return export_path def get_subgraph_by_edge_type(schema_url, relationship): - # use schema generator and schema explorer - sg = SchemaGenerator(path_to_json_ld=schema_url) - se = SchemaExplorer() - se.load_schema(schema_url) + data_model_parser = DataModelParser(path_to_data_model = schema_url) + + #Parse Model + parsed_data_model = data_model_parser.parse_model() - # get the schema graph - schema_graph = se.get_nx_schema() + # Instantiate DataModelGraph + data_model_grapher = DataModelGraph(parsed_data_model) - # relationship subgraph - relationship_subgraph = sg.get_subgraph_by_edge_type(schema_graph, relationship) + # Generate graph + graph_data_model = data_model_grapher.generate_data_model_graph() + dmge = DataModelGraphExplorer(graph_data_model) + + # relationship subgraph + relationship_subgraph = dmge.get_subgraph_by_edge_type(relationship) # return relationship Arr = [] for t in relationship_subgraph.edges: @@ -677,14 +719,20 @@ def get_subgraph_by_edge_type(schema_url, relationship): def 
find_class_specific_properties(schema_url, schema_class): - # use schema explorer - se = SchemaExplorer() + data_model_parser = DataModelParser(path_to_data_model = schema_url) + #Parse Model + parsed_data_model = data_model_parser.parse_model() + + # Instantiate DataModelGraph + data_model_grapher = DataModelGraph(parsed_data_model) - # load schema - se.load_schema(schema_url) + # Generate graph + graph_data_model = data_model_grapher.generate_data_model_graph() + + dmge = DataModelGraphExplorer(graph_data_model) # return properties - properties = se.find_class_specific_properties(schema_class) + properties = dmge.find_class_specific_properties(schema_class) return properties @@ -712,15 +760,25 @@ def get_node_dependencies( Returns: list[str]: List of nodes that are dependent on the source node. """ - gen = SchemaGenerator(path_to_json_ld=schema_url) - dependencies = gen.get_node_dependencies( + data_model_parser = DataModelParser(path_to_data_model = schema_url) + #Parse Model + parsed_data_model = data_model_parser.parse_model() + + # Instantiate DataModelGraph + data_model_grapher = DataModelGraph(parsed_data_model) + + # Generate graph + graph_data_model = data_model_grapher.generate_data_model_graph() + + dmge = DataModelGraphExplorer(graph_data_model) + + dependencies = dmge.get_node_dependencies( source_node, return_display_names, return_schema_ordered ) return dependencies -def get_property_label_from_display_name( - schema_url: str, +def get_property_label_from_display_name_route( display_name: str, strict_camel_case: bool = False ) -> str: @@ -735,9 +793,7 @@ def get_property_label_from_display_name( Returns: str: The property label of the display name """ - explorer = SchemaExplorer() - explorer.load_schema(schema_url) - label = explorer.get_property_label_from_display_name(display_name, strict_camel_case) + label = get_property_label_from_display_name(display_name=display_name, strict_camel_case=strict_camel_case) return label @@ -757,8 +813,19 @@ def get_node_range( Returns: list[str]: A list of nodes """ - gen = SchemaGenerator(path_to_json_ld=schema_url) - node_range = gen.get_node_range(node_label, return_display_names) + data_model_parser = DataModelParser(path_to_data_model = schema_url) + #Parse Model + parsed_data_model = data_model_parser.parse_model() + + # Instantiate DataModelGraph + data_model_grapher = DataModelGraph(parsed_data_model) + + # Generate graph + graph_data_model = data_model_grapher.generate_data_model_graph() + + dmge = DataModelGraphExplorer(graph_data_model) + + node_range = dmge.get_node_range(node_label, return_display_names) return node_range def get_if_node_required(schema_url: str, node_display_name: str) -> bool: @@ -772,8 +839,19 @@ def get_if_node_required(schema_url: str, node_display_name: str) -> bool: True: If the given node is a "required" node. False: If the given node is not a "required" (i.e., an "optional") node. 
""" - gen = SchemaGenerator(path_to_json_ld=schema_url) - is_required = gen.is_node_required(node_display_name) + data_model_parser = DataModelParser(path_to_data_model = schema_url) + #Parse Model + parsed_data_model = data_model_parser.parse_model() + + # Instantiate DataModelGraph + data_model_grapher = DataModelGraph(parsed_data_model) + + # Generate graph + graph_data_model = data_model_grapher.generate_data_model_graph() + + dmge = DataModelGraphExplorer(graph_data_model) + + is_required = dmge.get_node_required(node_display_name) return is_required @@ -785,8 +863,22 @@ def get_node_validation_rules(schema_url: str, node_display_name: str) -> list: Returns: List of valiation rules for a given node. """ - gen = SchemaGenerator(path_to_json_ld=schema_url) - node_validation_rules = gen.get_node_validation_rules(node_display_name) + # Instantiate DataModelParser + data_model_parser = DataModelParser(path_to_data_model = schema_url) + + #Parse Model + parsed_data_model = data_model_parser.parse_model() + + # Instantiate DataModelGraph + data_model_grapher = DataModelGraph(parsed_data_model) + + # Generate graph + graph_data_model = data_model_grapher.generate_data_model_graph() + + #Instantiate DataModelGraphExplorer + dmge = DataModelGraphExplorer(graph_data_model) + + node_validation_rules = dmge.get_node_validation_rules(node_display_name) return node_validation_rules @@ -801,9 +893,22 @@ def get_nodes_display_names(schema_url: str, node_list: list[str]) -> list: node_display_names (List[str]): List of node display names. """ - gen = SchemaGenerator(path_to_json_ld=schema_url) - mm_graph = gen.se.get_nx_schema() - node_display_names = gen.get_nodes_display_names(node_list, mm_graph) + # Instantiate DataModelParser + data_model_parser = DataModelParser(path_to_data_model = schema_url) + + #Parse Model + parsed_data_model = data_model_parser.parse_model() + + # Instantiate DataModelGraph + data_model_grapher = DataModelGraph(parsed_data_model) + + # Generate graph + graph_data_model = data_model_grapher.generate_data_model_graph() + + #Instantiate DataModelGraphExplorer + dmge = DataModelGraphExplorer(graph_data_model) + + node_display_names = dmge.get_nodes_display_names(node_list) return node_display_names def get_schematic_version() -> str: diff --git a/tests/conftest.py b/tests/conftest.py index fa0fb421e..55d100310 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -9,14 +9,17 @@ from dotenv import load_dotenv, find_dotenv from time import perf_counter -from schematic.schemas.explorer import SchemaExplorer +from schematic.schemas.data_model_parser import DataModelParser +from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer +from schematic.schemas.data_model_nodes import DataModelNodes +from schematic.schemas.data_model_json_schema import DataModelJSONSchema + from schematic.configuration.configuration import CONFIG from schematic.utils.df_utils import load_df from schematic.store.synapse import SynapseStorage load_dotenv() - logging.basicConfig(level=logging.DEBUG) logger = logging.getLogger(__name__) @@ -54,15 +57,29 @@ def get_data_frame(path, *paths, **kwargs): return load_df(fullpath, **kwargs) @staticmethod - def get_schema_explorer(path=None, *paths): + def get_data_model_graph_explorer(path=None, *paths): + #commenting this now bc we dont want to have multiple instances if path is None: - return SchemaExplorer() + return fullpath = Helpers.get_data_path(path, *paths) - se = SchemaExplorer() - se.load_schema(fullpath) - return se + # 
Instantiate DataModelParser + data_model_parser = DataModelParser(path_to_data_model = fullpath) + + #Parse Model + parsed_data_model = data_model_parser.parse_model() + + # Instantiate DataModelGraph + data_model_grapher = DataModelGraph(parsed_data_model) + + # Generate graph + graph_data_model = data_model_grapher.generate_data_model_graph() + + #Instantiate DataModelGraphExplorer + DMGE = DataModelGraphExplorer(graph_data_model) + + return DMGE @staticmethod def get_python_version(): diff --git a/tests/data/example.model.csv b/tests/data/example.model.csv index f88948758..6858e509c 100644 --- a/tests/data/example.model.csv +++ b/tests/data/example.model.csv @@ -42,4 +42,4 @@ Check Date,,,,,TRUE,DataProperty,,,date Check NA,,,,,TRUE,DataProperty,,,int::IsNA MockRDB,,,"Component, MockRDB_id, SourceManifest",,FALSE,DataType,,, MockRDB_id,,,,,TRUE,DataProperty,,,int -SourceManifest,,,,,TRUE,DataProperty,,, \ No newline at end of file +SourceManifest,,,,,TRUE,DataProperty,,, diff --git a/tests/data/example.model.jsonld b/tests/data/example.model.jsonld index 6666bb09e..a58d36323 100644 --- a/tests/data/example.model.jsonld +++ b/tests/data/example.model.jsonld @@ -7,1971 +7,6 @@ "xsd": "http://www.w3.org/2001/XMLSchema#" }, "@graph": [ - { - "@id": "schema:Text", - "@type": [ - "schema:DataType", - "rdfs:Class" - ], - "rdfs:comment": "Data type: Text.", - "rdfs:label": "Text" - }, - { - "@id": "schema:Number", - "@type": [ - "schema:DataType", - "rdfs:Class" - ], - "rdfs:comment": "Data type: Number.", - "rdfs:label": "Number" - }, - { - "@id": "schema:Integer", - "@type": "rdfs:Class", - "rdfs:comment": "Data type: Integer.", - "rdfs:label": "Integer", - "rdfs:subClassOf": { - "@id": "schema:Number" - } - }, - { - "@id": "schema:Thing", - "@type": "rdfs:Class", - "rdfs:comment": "Thing", - "rdfs:label": "Thing", - "schema:isPartOf": { - "@id": "http://schema.org" - } - }, - { - "@id": "bts:BiologicalEntity", - "@type": "rdfs:Class", - "rdfs:comment": null, - "rdfs:label": "BiologicalEntity", - "rdfs:subClassOf": { - "@id": "schema:Thing" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:OntologyClass", - "@type": "rdfs:Class", - "rdfs:comment": "a concept or class in an ontology, vocabulary or thesaurus", - "rdfs:label": "OntologyClass", - "rdfs:subClassOf": { - "@id": "schema:Thing" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:RelationshipType", - "@type": "rdfs:Class", - "rdfs:comment": "An OWL property used as an edge label", - "rdfs:label": "RelationshipType", - "rdfs:subClassOf": { - "@id": "bts:OntologyClass" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:GeneOntologyClass", - "@type": "rdfs:Class", - "rdfs:comment": "an ontology class that describes a functional aspect of a gene, gene prodoct or complex", - "rdfs:label": "GeneOntologyClass", - "rdfs:subClassOf": { - "@id": "bts:OntologyClass" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:OrganismTaxon", - "@type": "rdfs:Class", - "rdfs:comment": null, - "rdfs:label": "OrganismTaxon", - "rdfs:subClassOf": { - "@id": "bts:OntologyClass" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:OrganismalEntity", - "@type": "rdfs:Class", - "rdfs:comment": "A named entity that is either a part of an organism, a whole organism, population or clade of organisms, excluding molecular entities", - "rdfs:label": 
"OrganismalEntity", - "rdfs:subClassOf": { - "@id": "bts:BiologicalEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:IndividualOrganism", - "@type": "rdfs:Class", - "rdfs:comment": null, - "rdfs:label": "IndividualOrganism", - "rdfs:subClassOf": { - "@id": "bts:OrganismalEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:Case", - "@type": "rdfs:Class", - "rdfs:comment": "An individual organism that has a patient role in some clinical context.", - "rdfs:label": "Case", - "rdfs:subClassOf": { - "@id": "bts:IndividualOrganism" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:PopulationOfIndividualOrganisms", - "@type": "rdfs:Class", - "rdfs:comment": null, - "rdfs:label": "PopulationOfIndividualOrganisms", - "rdfs:subClassOf": { - "@id": "bts:OrganismalEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:Biosample", - "@type": "rdfs:Class", - "rdfs:comment": null, - "rdfs:label": "Biosample", - "rdfs:subClassOf": { - "@id": "bts:OrganismalEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:DiseaseOrPhenotypicFeature", - "@type": "rdfs:Class", - "rdfs:comment": "Either one of a disease or an individual phenotypic feature. Some knowledge resources such as Monarch treat these as distinct, others such as MESH conflate.", - "rdfs:label": "DiseaseOrPhenotypicFeature", - "rdfs:subClassOf": { - "@id": "bts:BiologicalEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:Disease", - "@type": "rdfs:Class", - "rdfs:comment": null, - "rdfs:label": "Disease", - "rdfs:subClassOf": { - "@id": "bts:DiseaseOrPhenotypicFeature" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:PhenotypicFeature", - "@type": "rdfs:Class", - "rdfs:comment": null, - "rdfs:label": "PhenotypicFeature", - "rdfs:subClassOf": { - "@id": "bts:DiseaseOrPhenotypicFeature" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:Environment", - "@type": "rdfs:Class", - "rdfs:comment": "A feature of the environment of an organism that influences one or more phenotypic features of that organism, potentially mediated by genes", - "rdfs:label": "Environment", - "rdfs:subClassOf": { - "@id": "bts:BiologicalEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:InformationContentEntity", - "@type": "rdfs:Class", - "rdfs:comment": "a piece of information that typically describes some piece of biology or is used as support.", - "rdfs:label": "InformationContentEntity", - "rdfs:subClassOf": { - "@id": "schema:Thing" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:ConfidenceLevel", - "@type": "rdfs:Class", - "rdfs:comment": "Level of confidence in a statement", - "rdfs:label": "ConfidenceLevel", - "rdfs:subClassOf": { - "@id": "bts:InformationContentEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:EvidenceType", - "@type": "rdfs:Class", - "rdfs:comment": "Class of evidence that supports an association", - "rdfs:label": "EvidenceType", - "rdfs:subClassOf": { - "@id": "bts:InformationContentEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:Publication", - "@type": "rdfs:Class", - "rdfs:comment": "Any 
published piece of information. Can refer to a whole publication, or to a part of it (e.g. a figure, figure legend, or section highlighted by NLP). The scope is intended to be general and include information published on the web as well as journals.", - "rdfs:label": "Publication", - "rdfs:subClassOf": { - "@id": "bts:InformationContentEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:MolecularEntity", - "@type": "rdfs:Class", - "rdfs:comment": "A gene, gene product, small molecule or macromolecule (including protein complex)", - "rdfs:label": "MolecularEntity", - "rdfs:subClassOf": { - "@id": "bts:BiologicalEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:ChemicalSubstance", - "@type": "rdfs:Class", - "rdfs:comment": "May be a chemical entity or a formulation with a chemical entity as active ingredient, or a complex material with multiple chemical entities as part", - "rdfs:label": "ChemicalSubstance", - "rdfs:subClassOf": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:Drug", - "@type": "rdfs:Class", - "rdfs:comment": "A substance intended for use in the diagnosis, cure, mitigation, treatment, or prevention of disease", - "rdfs:label": "Drug", - "rdfs:subClassOf": { - "@id": "bts:ChemicalSubstance" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:Metabolite", - "@type": "rdfs:Class", - "rdfs:comment": "Any intermediate or product resulting from metabolism. Includes primary and secondary metabolites.", - "rdfs:label": "Metabolite", - "rdfs:subClassOf": { - "@id": "bts:ChemicalSubstance" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:AnatomicalEntity", - "@type": "rdfs:Class", - "rdfs:comment": "A subcellular location, cell type or gross anatomical part", - "rdfs:label": "AnatomicalEntity", - "rdfs:subClassOf": { - "@id": "bts:OrganismalEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:LifeStage", - "@type": "rdfs:Class", - "rdfs:comment": "A stage of development or growth of an organism, including post-natal adult stages", - "rdfs:label": "LifeStage", - "rdfs:subClassOf": { - "@id": "bts:OrganismalEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:PlanetaryEntity", - "@type": "rdfs:Class", - "rdfs:comment": "Any entity or process that exists at the level of the whole planet", - "rdfs:label": "PlanetaryEntity", - "rdfs:subClassOf": { - "@id": "schema:Thing" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:EnvironmentalProcess", - "@type": "rdfs:Class", - "rdfs:comment": null, - "rdfs:label": "EnvironmentalProcess", - "rdfs:subClassOf": { - "@id": "bts:PlanetaryEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:EnvironmentalFeature", - "@type": "rdfs:Class", - "rdfs:comment": null, - "rdfs:label": "EnvironmentalFeature", - "rdfs:subClassOf": { - "@id": "bts:PlanetaryEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:ClinicalEntity", - "@type": "rdfs:Class", - "rdfs:comment": "Any entity or process that exists in the clinical domain and outside the biological realm. 
Diseases are placed under biological entities", - "rdfs:label": "ClinicalEntity", - "rdfs:subClassOf": { - "@id": "schema:Thing" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:ClinicalTrial", - "@type": "rdfs:Class", - "rdfs:comment": null, - "rdfs:label": "ClinicalTrial", - "rdfs:subClassOf": { - "@id": "bts:ClinicalEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:ClinicalIntervention", - "@type": "rdfs:Class", - "rdfs:comment": null, - "rdfs:label": "ClinicalIntervention", - "rdfs:subClassOf": { - "@id": "bts:ClinicalEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:Device", - "@type": "rdfs:Class", - "rdfs:comment": "A thing made or adapted for a particular purpose, especially a piece of mechanical or electronic equipment", - "rdfs:label": "Device", - "rdfs:subClassOf": { - "@id": "schema:Thing" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:GenomicEntity", - "@type": "rdfs:Class", - "rdfs:comment": "an entity that can either be directly located on a genome (gene, transcript, exon, regulatory region) or is encoded in a genome (protein)", - "rdfs:label": "GenomicEntity", - "rdfs:subClassOf": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:Genome", - "@type": "rdfs:Class", - "rdfs:comment": "A genome is the sum of genetic material within a cell or virion.", - "rdfs:label": "Genome", - "rdfs:subClassOf": { - "@id": "bts:GenomicEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:Transcript", - "@type": "rdfs:Class", - "rdfs:comment": "An RNA synthesized on a DNA or RNA template by an RNA polymerase", - "rdfs:label": "Transcript", - "rdfs:subClassOf": { - "@id": "bts:GenomicEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:Exon", - "@type": "rdfs:Class", - "rdfs:comment": "A region of the transcript sequence within a gene which is not removed from the primary RNA transcript by RNA splicing", - "rdfs:label": "Exon", - "rdfs:subClassOf": { - "@id": "bts:GenomicEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:CodingSequence", - "@type": "rdfs:Class", - "rdfs:comment": null, - "rdfs:label": "CodingSequence", - "rdfs:subClassOf": { - "@id": "bts:GenomicEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:MacromolecularMachine", - "@type": "rdfs:Class", - "rdfs:comment": "A union of gene, gene product, and macromolecular complex. These are the basic units of function in a cell. They either carry out individual biological activities, or they encode molecules which do this.", - "rdfs:label": "MacromolecularMachine", - "rdfs:subClassOf": { - "@id": "bts:GenomicEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:GeneOrGeneProduct", - "@type": "rdfs:Class", - "rdfs:comment": "a union of genes or gene products. 
Frequently an identifier for one will be used as proxy for another", - "rdfs:label": "GeneOrGeneProduct", - "rdfs:subClassOf": { - "@id": "bts:MacromolecularMachine" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:Gene", - "@type": "rdfs:Class", - "rdfs:comment": null, - "rdfs:label": "Gene", - "rdfs:subClassOf": { - "@id": "bts:GeneOrGeneProduct" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:GeneProduct", - "@type": "rdfs:Class", - "rdfs:comment": "The functional molecular product of a single gene. Gene products are either proteins or functional RNA molecules", - "rdfs:label": "GeneProduct", - "rdfs:subClassOf": { - "@id": "bts:GeneOrGeneProduct" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:Protein", - "@type": "rdfs:Class", - "rdfs:comment": "A gene product that is composed of a chain of amino acid sequences and is produced by ribosome-mediated translation of mRNA", - "rdfs:label": "Protein", - "rdfs:subClassOf": { - "@id": "bts:GeneProduct" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:GeneProductIsoform", - "@type": "rdfs:Class", - "rdfs:comment": "This is an abstract class that can be mixed in with different kinds of gene products to indicate that the gene product is intended to represent a specific isoform rather than a canonical or reference or generic product. The designation of canonical or reference may be arbitrary, or it may represent the superclass of all isoforms.", - "rdfs:label": "GeneProductIsoform", - "rdfs:subClassOf": { - "@id": "bts:GeneProduct" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:ProteinIsoform", - "@type": "rdfs:Class", - "rdfs:comment": "Represents a protein that is a specific isoform of the canonical or reference protein. 
See https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4114032/", - "rdfs:label": "ProteinIsoform", - "rdfs:subClassOf": { - "@id": "bts:Protein" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:RnaProduct", - "@type": "rdfs:Class", - "rdfs:comment": null, - "rdfs:label": "RnaProduct", - "rdfs:subClassOf": { - "@id": "bts:GeneProduct" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:RnaProductIsoform", - "@type": "rdfs:Class", - "rdfs:comment": "Represents a protein that is a specific isoform of the canonical or reference RNA", - "rdfs:label": "RnaProductIsoform", - "rdfs:subClassOf": { - "@id": "bts:RnaProduct" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:NoncodingRnaProduct", - "@type": "rdfs:Class", - "rdfs:comment": null, - "rdfs:label": "NoncodingRnaProduct", - "rdfs:subClassOf": { - "@id": "bts:RnaProduct" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:Microrna", - "@type": "rdfs:Class", - "rdfs:comment": null, - "rdfs:label": "Microrna", - "rdfs:subClassOf": { - "@id": "bts:NoncodingRnaProduct" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:MacromolecularComplex", - "@type": "rdfs:Class", - "rdfs:comment": null, - "rdfs:label": "MacromolecularComplex", - "rdfs:subClassOf": { - "@id": "bts:MacromolecularMachine" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:GeneFamily", - "@type": "rdfs:Class", - "rdfs:comment": "any grouping of multiple genes or gene products related by common descent", - "rdfs:label": "GeneFamily", - "rdfs:subClassOf": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:Genotype", - "@type": "rdfs:Class", - "rdfs:comment": "An information content entity that describes a genome by specifying the total variation in genomic sequence and/or gene expression, relative to some extablished background", - "rdfs:label": "Genotype", - "rdfs:subClassOf": { - "@id": "bts:GenomicEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:Haplotype", - "@type": "rdfs:Class", - "rdfs:comment": "A set of zero or more Alleles on a single instance of a Sequence[VMC]", - "rdfs:label": "Haplotype", - "rdfs:subClassOf": { - "@id": "bts:GenomicEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:SequenceVariant", - "@type": "rdfs:Class", - "rdfs:comment": "An allele that varies in its sequence from what is considered the reference allele at that locus.", - "rdfs:label": "SequenceVariant", - "rdfs:subClassOf": { - "@id": "bts:GenomicEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:DrugExposure", - "@type": "rdfs:Class", - "rdfs:comment": "A drug exposure is an intake of a particular chemical substance", - "rdfs:label": "DrugExposure", - "rdfs:subClassOf": { - "@id": "bts:Environment" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:Treatment", - "@type": "rdfs:Class", - "rdfs:comment": "A treatment is targeted at a disease or phenotype and may involve multiple drug 'exposures'", - "rdfs:label": "Treatment", - "rdfs:subClassOf": { - "@id": "bts:Environment" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:GeographicLocation", - 
"@type": "rdfs:Class", - "rdfs:comment": "a location that can be described in lat/long coordinates", - "rdfs:label": "GeographicLocation", - "rdfs:subClassOf": { - "@id": "bts:PlanetaryEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:GeographicLocationAtTime", - "@type": "rdfs:Class", - "rdfs:comment": "a location that can be described in lat/long coordinates, for a particular time", - "rdfs:label": "GeographicLocationAtTime", - "rdfs:subClassOf": { - "@id": "bts:GeographicLocation" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:Occurrent", - "@type": "rdfs:Class", - "rdfs:comment": "A processual entity", - "rdfs:label": "Occurrent", - "rdfs:subClassOf": { - "@id": "schema:Thing" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:BiologicalProcessOrActivity", - "@type": "rdfs:Class", - "rdfs:comment": "Either an individual molecular activity, or a collection of causally connected molecular activities", - "rdfs:label": "BiologicalProcessOrActivity", - "rdfs:subClassOf": { - "@id": "bts:BiologicalEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:MolecularActivity", - "@type": "rdfs:Class", - "rdfs:comment": "An execution of a molecular function carried out by a gene product or macromolecular complex.", - "rdfs:label": "MolecularActivity", - "rdfs:subClassOf": { - "@id": "bts:BiologicalProcessOrActivity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:ActivityAndBehavior", - "@type": "rdfs:Class", - "rdfs:comment": "Activity or behavior of any independent integral living, organization or mechanical actor in the world", - "rdfs:label": "ActivityAndBehavior", - "rdfs:subClassOf": { - "@id": "bts:Occurrent" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:Procedure", - "@type": "rdfs:Class", - "rdfs:comment": "A series of actions conducted in a certain order or manner", - "rdfs:label": "Procedure", - "rdfs:subClassOf": { - "@id": "bts:Occurrent" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:Phenomenon", - "@type": "rdfs:Class", - "rdfs:comment": "a fact or situation that is observed to exist or happen, especially one whose cause or explanation is in question", - "rdfs:label": "Phenomenon", - "rdfs:subClassOf": { - "@id": "bts:Occurrent" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:BiologicalProcess", - "@type": "rdfs:Class", - "rdfs:comment": "One or more causally connected executions of molecular functions", - "rdfs:label": "BiologicalProcess", - "rdfs:subClassOf": { - "@id": "bts:BiologicalProcessOrActivity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:Pathway", - "@type": "rdfs:Class", - "rdfs:comment": null, - "rdfs:label": "Pathway", - "rdfs:subClassOf": { - "@id": "bts:BiologicalProcess" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:PhysiologicalProcess", - "@type": "rdfs:Class", - "rdfs:comment": null, - "rdfs:label": "PhysiologicalProcess", - "rdfs:subClassOf": { - "@id": "bts:BiologicalProcess" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:CellularComponent", - "@type": "rdfs:Class", - "rdfs:comment": "A location in or around a cell", - "rdfs:label": "CellularComponent", - 
"rdfs:subClassOf": { - "@id": "bts:AnatomicalEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:Cell", - "@type": "rdfs:Class", - "rdfs:comment": null, - "rdfs:label": "Cell", - "rdfs:subClassOf": { - "@id": "bts:AnatomicalEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:CellLine", - "@type": "rdfs:Class", - "rdfs:comment": null, - "rdfs:label": "CellLine", - "rdfs:subClassOf": { - "@id": "bts:Biosample" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:GrossAnatomicalStructure", - "@type": "rdfs:Class", - "rdfs:comment": null, - "rdfs:label": "GrossAnatomicalStructure", - "rdfs:subClassOf": { - "@id": "bts:AnatomicalEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - } - }, - { - "@id": "bts:ensembl", - "@type": "rdf:Property", - "rdfs:comment": "Ensembl ID for gene, protein or transcript", - "rdfs:label": "ensembl", - "schema:domainIncludes": [ - { - "@id": "bts:Transcript" - } - ], - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "schema:Text" - } - }, - { - "@id": "bts:hgnc", - "@type": "rdf:Property", - "rdfs:comment": "HGNC ID for gene", - "rdfs:label": "hgnc", - "schema:domainIncludes": { - "@id": "bts:Gene" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "schema:Integer" - } - }, - { - "@id": "bts:entrez", - "@type": "rdf:Property", - "rdfs:comment": "Entrez ID for gene", - "rdfs:label": "entrez", - "schema:domainIncludes": { - "@id": "bts:Gene" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "schema:Integer" - } - }, - { - "@id": "bts:refseq", - "@type": "rdf:Property", - "rdfs:comment": "Refseq ID for gene, protein or transcript", - "rdfs:label": "refseq", - "schema:domainIncludes": [ - { - "@id": "bts:Transcript" - } - ], - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "schema:Text" - } - }, - { - "@id": "bts:omim", - "@type": "rdf:Property", - "rdfs:comment": "Refseq ID for gene, protein or transcript", - "rdfs:label": "omim", - "schema:domainIncludes": [ - { - "@id": "bts:Disease" - } - ], - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "schema:Integer" - } - }, - { - "@id": "bts:umls", - "@type": "rdf:Property", - "rdfs:comment": "Refseq ID for gene, protein or transcript", - "rdfs:label": "umls", - "schema:domainIncludes": { - "@id": "bts:Disease" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "schema:Text" - } - }, - { - "@id": "bts:homologousTo", - "@type": "rdf:Property", - "rdfs:comment": "Shared ancestry between protein or gene", - "rdfs:label": "homologousTo", - "schema:domainIncludes": { - "@id": "bts:GeneOrGeneProduct" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:GeneOrGeneProduct" - } - }, - { - "@id": "bts:molecularlyInteractsWith", - "@type": "rdf:Property", - "rdfs:comment": null, - "rdfs:label": "molecularlyInteractsWith", - "schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:MolecularEntity" - } - }, - { - "@id": "bts:geneticallyInteractsWith", - "@type": "rdf:Property", - 
"rdfs:comment": "holds between two genes whose phenotypic effects are dependent on each other in some way - such that their combined phenotypic effects are the result of some interaction between the activity of their gene products. Examples include epistasis and synthetic lethality.", - "rdfs:label": "geneticallyInteractsWith", - "schema:domainIncludes": { - "@id": "bts:Gene" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:Gene" - } - }, - { - "@id": "bts:affectsAbundanceOf", - "@type": "rdf:Property", - "rdfs:comment": "holds between two molecular entities where the action or effect of one changes the amount of the other within a system of interest", - "rdfs:label": "affectsAbundanceOf", - "schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:MolecularEntity" - } - }, - { - "@id": "bts:increasesAbundanceOf", - "@type": "rdf:Property", - "rdfs:comment": "holds between two molecular entities where the action or effect of one increases the amount of the other within a system of interest", - "rdfs:label": "increasesAbundanceOf", - "schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:MolecularEntity" - } - }, - { - "@id": "bts:decreasesAbundanceOf", - "@type": "rdf:Property", - "rdfs:comment": "holds between two molecular entities where the action or effect of one decreases the amount of the other within a system of interest", - "rdfs:label": "decreasesAbundanceOf", - "schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:MolecularEntity" - } - }, - { - "@id": "bts:affectsActivityOf", - "@type": "rdf:Property", - "rdfs:comment": "holds between two molecular entities where the action or effect of one changes the activity of the other within a system of interest", - "rdfs:label": "affectsActivityOf", - "schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:MolecularEntity" - } - }, - { - "@id": "bts:increasesActivityOf", - "@type": "rdf:Property", - "rdfs:comment": "holds between two molecular entities where the action or effect of one increases the activity of the other within a system of interest", - "rdfs:label": "increasesActivityOf", - "schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:MolecularEntity" - } - }, - { - "@id": "bts:decreasesActivityOf", - "@type": "rdf:Property", - "rdfs:comment": "holds between two molecular entities where the action or effect of one decreases the activity of the other within a system of interest", - "rdfs:label": "decreasesActivityOf", - "schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:MolecularEntity" - } - }, - { - "@id": "bts:affectsExpressionOf", - "@type": "rdf:Property", - "rdfs:comment": "holds between two molecular entities where the action or effect of one changes the level of expression of the other within a system of interest", - "rdfs:label": "affectsExpressionOf", - 
"schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:GenomicEntity" - } - }, - { - "@id": "bts:increasesExpressionOf", - "@type": "rdf:Property", - "rdfs:comment": "holds between two molecular entities where the action or effect of one increases the level of expression of the other within a system of interest", - "rdfs:label": "increasesExpressionOf", - "schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:GenomicEntity" - } - }, - { - "@id": "bts:decreasesExpressionOf", - "@type": "rdf:Property", - "rdfs:comment": "holds between two molecular entities where the action or effect of one decreases the level of expression of the other within a system of interest", - "rdfs:label": "decreasesExpressionOf", - "schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:GenomicEntity" - } - }, - { - "@id": "bts:affectsFoldingOf", - "@type": "rdf:Property", - "rdfs:comment": "holds between two molecular entities where the action or effect of one changes the rate or quality of folding of the other ", - "rdfs:label": "affectsFoldingOf", - "schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:MolecularEntity" - } - }, - { - "@id": "bts:increasesFoldingOf", - "@type": "rdf:Property", - "rdfs:comment": "holds between two molecular entities where the action or effect of one increases the rate or quality of folding of the other ", - "rdfs:label": "increasesFoldingOf", - "schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:MolecularEntity" - } - }, - { - "@id": "bts:decreasesFoldingOf", - "@type": "rdf:Property", - "rdfs:comment": "holds between two molecular entities where the action or effect of one decreases the rate or quality of folding of the other ", - "rdfs:label": "decreasesFoldingOf", - "schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:MolecularEntity" - } - }, - { - "@id": "bts:affectsLocalizationOf", - "@type": "rdf:Property", - "rdfs:comment": "holds between two molecular entities where the action or effect of one changes the localization of the other within a system of interest", - "rdfs:label": "affectsLocalizationOf", - "schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:MolecularEntity" - } - }, - { - "@id": "bts:increasesLocalizationOf", - "@type": "rdf:Property", - "rdfs:comment": "holds between two molecular entities where the action or effect of one increases the proper localization of the other within a system of interest", - "rdfs:label": "increasesLocalizationOf", - "schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:MolecularEntity" - } - }, - { - "@id": "bts:decreasesLocalizationOf", - "@type": "rdf:Property", - "rdfs:comment": "holds between two molecular 
entities where the action or effect of one decreases the proper localization of the other within a system of interest", - "rdfs:label": "decreasesLocalizationOf", - "schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:MolecularEntity" - } - }, - { - "@id": "bts:affectsMetabolicProcessingOf", - "@type": "rdf:Property", - "rdfs:comment": "holds between two molecular entities where the action or effect of one impacts the metabolic processing of the other within a system of interest", - "rdfs:label": "affectsMetabolicProcessingOf", - "schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:MolecularEntity" - } - }, - { - "@id": "bts:increasesMetabolicProcessingOf", - "@type": "rdf:Property", - "rdfs:comment": "holds between two molecular entities where the action or effect of one increases the rate of metabolic processing of the other within a system of interest", - "rdfs:label": "increasesMetabolicProcessingOf", - "schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:MolecularEntity" - } - }, - { - "@id": "bts:decreasesMetabolicProcessingOf", - "@type": "rdf:Property", - "rdfs:comment": "holds between two molecular entities where the action or effect of one decreases the rate of metabolic processing of the other within a system of interest", - "rdfs:label": "decreasesMetabolicProcessingOf", - "schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:MolecularEntity" - } - }, - { - "@id": "bts:affectsMolecularModificationOf", - "@type": "rdf:Property", - "rdfs:comment": "holds between two molecular entities where the action or effect of one leads changes in the molecular modification(s) of the other (e.g. via post-translational modifications of proteins such as the addition of phosphoryl group, or via redox reaction that adds or subtracts electrons)", - "rdfs:label": "affectsMolecularModificationOf", - "schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:MolecularEntity" - } - }, - { - "@id": "bts:increasesMolecularModificationOf", - "@type": "rdf:Property", - "rdfs:comment": "holds between two molecular entities where the action or effect of one leads to increased molecular modification(s) of the other (e.g. via post-translational modifications of proteins such as the addition of phosphoryl group, or via redox reaction that adds or subtracts electrons)", - "rdfs:label": "increasesMolecularModificationOf", - "schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:MolecularEntity" - } - }, - { - "@id": "bts:decreasesMolecularModificationOf", - "@type": "rdf:Property", - "rdfs:comment": "holds between two molecular entities where the action or effect of one leads to decreased molecular modification(s) of the other (e.g. 
via post-translational modifications of proteins such as the addition of phosphoryl group, or via redox reaction that adds or subtracts electrons)", - "rdfs:label": "decreasesMolecularModificationOf", - "schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:MolecularEntity" - } - }, - { - "@id": "bts:affectsSynthesisOf", - "@type": "rdf:Property", - "rdfs:comment": "holds between two molecular entities where the action or effect of one impacts the rate of chemical synthesis of the other", - "rdfs:label": "affectsSynthesisOf", - "schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:MolecularEntity" - } - }, - { - "@id": "bts:increasesSynthesisOf", - "@type": "rdf:Property", - "rdfs:comment": "holds between two molecular entities where the action or effect of one increases the rate of chemical synthesis of the other", - "rdfs:label": "increasesSynthesisOf", - "schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:MolecularEntity" - } - }, - { - "@id": "bts:decreasesSynthesisOf", - "@type": "rdf:Property", - "rdfs:comment": "holds between two molecular entities where the action or effect of one decreases the rate of chemical synthesis of the other", - "rdfs:label": "decreasesSynthesisOf", - "schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:MolecularEntity" - } - }, - { - "@id": "bts:affectsDegradationOf", - "@type": "rdf:Property", - "rdfs:comment": "holds between two molecular entities where the action or effect of one impacts the rate of degradation of the other within a system of interest", - "rdfs:label": "affectsDegradationOf", - "schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:MolecularEntity" - } - }, - { - "@id": "bts:increasesDegradationOf", - "@type": "rdf:Property", - "rdfs:comment": "holds between two molecular entities where the action or effect of one increases the rate of degradation of the other within a system of interest", - "rdfs:label": "increasesDegradationOf", - "schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:MolecularEntity" - } - }, - { - "@id": "bts:decreasesDegradationOf", - "@type": "rdf:Property", - "rdfs:comment": "holds between two molecular entities where the action or effect of one decreases the rate of degradation of the other within a system of interest", - "rdfs:label": "decreasesDegradationOf", - "schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:MolecularEntity" - } - }, - { - "@id": "bts:affectsMutationRateOf", - "@type": "rdf:Property", - "rdfs:comment": "holds between a molecular entity and a genomic entity where the action or effect of the molecular entity impacts the rate of mutation of the genomic entity within a system of interest", - "rdfs:label": "affectsMutationRateOf", - "schema:domainIncludes": { - "@id": 
"bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:GenomicEntity" - } - }, - { - "@id": "bts:increasesMutationRateOf", - "@type": "rdf:Property", - "rdfs:comment": "holds between a molecular entity and a genomic entity where the action or effect of the molecular entity increases the rate of mutation of the genomic entity within a system of interest", - "rdfs:label": "increasesMutationRateOf", - "schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:GenomicEntity" - } - }, - { - "@id": "bts:decreasesMutationRateOf", - "@type": "rdf:Property", - "rdfs:comment": "holds between a molecular entity and a genomic entity where the action or effect of the molecular entity decreases the rate of mutation of the genomic entity within a system of interest", - "rdfs:label": "decreasesMutationRateOf", - "schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:GenomicEntity" - } - }, - { - "@id": "bts:affectsResponseTo", - "@type": "rdf:Property", - "rdfs:comment": "holds between two molecular entities where the action or effect of one impacts the susceptibility of a biological entity or system (e.g. an organism, cell, cellular component, macromolecular machine, biological or pathological process) to the other", - "rdfs:label": "affectsResponseTo", - "schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:MolecularEntity" - } - }, - { - "@id": "bts:increasesResponseTo", - "@type": "rdf:Property", - "rdfs:comment": "holds between two molecular entities where the action or effect of one increases the susceptibility of a biological entity or system (e.g. an organism, cell, cellular component, macromolecular machine, biological or pathological process) to the other", - "rdfs:label": "increasesResponseTo", - "schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:MolecularEntity" - } - }, - { - "@id": "bts:decreasesResponseTo", - "@type": "rdf:Property", - "rdfs:comment": "holds between two molecular entities where the action or effect of one decreases the susceptibility of a biological entity or system (e.g. 
an organism, cell, cellular component, macromolecular machine, biological or pathological process) to the other", - "rdfs:label": "decreasesResponseTo", - "schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:MolecularEntity" - } - }, - { - "@id": "bts:affectsSplicingOf", - "@type": "rdf:Property", - "rdfs:comment": "holds between a molecular entity and an mRNA where the action or effect of the molecular entity impacts the splicing of the mRNA", - "rdfs:label": "affectsSplicingOf", - "schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:Transcript" - } - }, - { - "@id": "bts:increasesSplicingOf", - "@type": "rdf:Property", - "rdfs:comment": "holds between a molecular entity and an mRNA where the action or effect of the molecular entity increases the proper splicing of the mRNA", - "rdfs:label": "increasesSplicingOf", - "schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:Transcript" - } - }, - { - "@id": "bts:decreasesSplicingOf", - "@type": "rdf:Property", - "rdfs:comment": "holds between a molecular entity and an mRNA where the action or effect of the molecular entity decreases the proper splicing of the mRNA", - "rdfs:label": "decreasesSplicingOf", - "schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:Transcript" - } - }, - { - "@id": "bts:affectsStabilityOf", - "@type": "rdf:Property", - "rdfs:comment": "holds between two molecular entities where the action or effect of one impacts the stability of the other within a system of interest", - "rdfs:label": "affectsStabilityOf", - "schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:MolecularEntity" - } - }, - { - "@id": "bts:increasesStabilityOf", - "@type": "rdf:Property", - "rdfs:comment": "holds between two molecular entities where the action or effect of one increases the stability of the other within a system of interest", - "rdfs:label": "increasesStabilityOf", - "schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:MolecularEntity" - } - }, - { - "@id": "bts:decreasesStabilityOf", - "@type": "rdf:Property", - "rdfs:comment": "holds between two molecular entities where the action or effect of one decreases the stability of the other within a system of interest", - "rdfs:label": "decreasesStabilityOf", - "schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:MolecularEntity" - } - }, - { - "@id": "bts:affectsTransportOf", - "@type": "rdf:Property", - "rdfs:comment": "holds between two molecular entities where the action or effect of one impacts the rate of transport of the other across some boundary in a system of interest", - "rdfs:label": "affectsTransportOf", - "schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": 
"bts:MolecularEntity" - } - }, - { - "@id": "bts:increasesTransportOf", - "@type": "rdf:Property", - "rdfs:comment": "holds between two molecular entities where the action or effect of one increases the rate of transport of the other across some boundary in a system of interest", - "rdfs:label": "increasesTransportOf", - "schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:MolecularEntity" - } - }, - { - "@id": "bts:decreasesTransportOf", - "@type": "rdf:Property", - "rdfs:comment": "holds between two molecular entities where the action or effect of one decreases the rate of transport of the other across some boundary in a system of interest", - "rdfs:label": "decreasesTransportOf", - "schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:MolecularEntity" - } - }, - { - "@id": "bts:affectsSecretionOf", - "@type": "rdf:Property", - "rdfs:comment": "holds between two molecular entities where the action or effect of one impacts the rate of secretion of the other out of a cell, gland, or organ", - "rdfs:label": "affectsSecretionOf", - "schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:MolecularEntity" - } - }, - { - "@id": "bts:increasesSecretionOf", - "@type": "rdf:Property", - "rdfs:comment": "holds between two molecular entities where the action or effect of one increases the rate of secretion of the other out of a cell, gland, or organ", - "rdfs:label": "increasesSecretionOf", - "schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:MolecularEntity" - } - }, - { - "@id": "bts:decreasesSecretionOf", - "@type": "rdf:Property", - "rdfs:comment": "holds between two molecular entities where the action or effect of one decreases the rate of secretion of the other out of a cell, gland, or organ", - "rdfs:label": "decreasesSecretionOf", - "schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:MolecularEntity" - } - }, - { - "@id": "bts:affectsUptakeOf", - "@type": "rdf:Property", - "rdfs:comment": "holds between two molecular entities where the action or effect of one impacts the rate of uptake of the other into of a cell, gland, or organ", - "rdfs:label": "affectsUptakeOf", - "schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:MolecularEntity" - } - }, - { - "@id": "bts:increasesUptakeOf", - "@type": "rdf:Property", - "rdfs:comment": "holds between two molecular entities where the action or effect of one increases the rate of uptake of the other into of a cell, gland, or organ", - "rdfs:label": "increasesUptakeOf", - "schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:MolecularEntity" - } - }, - { - "@id": "bts:decreasesUptakeOf", - "@type": "rdf:Property", - "rdfs:comment": "holds between two molecular entities where the action or effect of one decreases the rate of uptake of the other 
into of a cell, gland, or organ", - "rdfs:label": "decreasesUptakeOf", - "schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:MolecularEntity" - } - }, - { - "@id": "bts:regulates,ProcessToProcess", - "@type": "rdf:Property", - "rdfs:comment": null, - "rdfs:label": "regulates,ProcessToProcess", - "schema:domainIncludes": { - "@id": "bts:Occurrent" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:Occurrent" - } - }, - { - "@id": "bts:regulates,EntityToEntity", - "@type": "rdf:Property", - "rdfs:comment": null, - "rdfs:label": "regulates,EntityToEntity", - "schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:MolecularEntity" - } - }, - { - "@id": "bts:hasGeneProduct", - "@type": "rdf:Property", - "rdfs:comment": "holds between a gene and a transcribed and/or translated product generated from it", - "rdfs:label": "hasGeneProduct", - "schema:domainIncludes": { - "@id": "bts:Gene" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:GeneProduct" - } - }, - { - "@id": "bts:inPathwayWith", - "@type": "rdf:Property", - "rdfs:comment": "holds between two genes or gene products that are part of in the same biological pathway", - "rdfs:label": "inPathwayWith", - "schema:domainIncludes": { - "@id": "bts:GeneOrGeneProduct" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:GeneOrGeneProduct" - } - }, - { - "@id": "bts:inComplexWith", - "@type": "rdf:Property", - "rdfs:comment": "holds between two genes or gene products that are part of (or code for products that are part of) in the same macromolecular complex", - "rdfs:label": "inComplexWith", - "schema:domainIncludes": { - "@id": "bts:GeneOrGeneProduct" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:GeneOrGeneProduct" - } - }, - { - "@id": "bts:inCellPopulationWith", - "@type": "rdf:Property", - "rdfs:comment": "holds between two genes or gene products that are expressed in the same cell type or population ", - "rdfs:label": "inCellPopulationWith", - "schema:domainIncludes": { - "@id": "bts:GeneOrGeneProduct" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:GeneOrGeneProduct" - } - }, - { - "@id": "bts:geneAssociatedWithCondition", - "@type": "rdf:Property", - "rdfs:comment": "holds between a gene and a disease or phenotypic feature that the gene or its alleles/products may influence, contribute to, or correlate with", - "rdfs:label": "geneAssociatedWithCondition", - "schema:domainIncludes": { - "@id": "bts:Gene" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:DiseaseOrPhenotypicFeature" - } - }, - { - "@id": "bts:treats", - "@type": "rdf:Property", - "rdfs:comment": "holds between a therapeutic procedure or chemical substance and a disease or phenotypic feature that it is used to treat", - "rdfs:label": "treats", - "schema:domainIncludes": { - "@id": "bts:Treatment" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:DiseaseOrPhenotypicFeature" - } - }, - { - "@id": 
"bts:correlatedWith", - "@type": "rdf:Property", - "rdfs:comment": "holds between a disease or phenotypic feature and a measurable molecular entity that is used as an indicator of the presence or state of the disease or feature.", - "rdfs:label": "correlatedWith", - "schema:domainIncludes": { - "@id": "bts:DiseaseOrPhenotypicFeature" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:MolecularEntity" - } - }, - { - "@id": "bts:hasBiomarker", - "@type": "rdf:Property", - "rdfs:comment": "holds between a disease or phenotypic feature and a measurable molecular entity that is used as an indicator of the presence or state of the disease or feature.", - "rdfs:label": "hasBiomarker", - "schema:domainIncludes": { - "@id": "bts:DiseaseOrPhenotypicFeature" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:MolecularEntity" - } - }, - { - "@id": "bts:biomarkerFor", - "@type": "rdf:Property", - "rdfs:comment": "holds between a measurable molecular entity and a disease or phenotypic feature, where the entity is used as an indicator of the presence or state of the disease or feature.", - "rdfs:label": "biomarkerFor", - "schema:domainIncludes": { - "@id": "bts:MolecularEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:DiseaseOrPhenotypicFeature" - } - }, - { - "@id": "bts:expressedIn", - "@type": "rdf:Property", - "rdfs:comment": "holds between a gene or gene product and an anatomical entity in which it is expressed", - "rdfs:label": "expressedIn", - "schema:domainIncludes": { - "@id": "bts:GeneOrGeneProduct" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:AnatomicalEntity" - } - }, - { - "@id": "bts:expresses", - "@type": "rdf:Property", - "rdfs:comment": "holds between an anatomical entity and gene or gene product that is expressed there", - "rdfs:label": "expresses", - "schema:domainIncludes": { - "@id": "bts:AnatomicalEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:GeneOrGeneProduct" - } - }, - { - "@id": "bts:hasPhenotype", - "@type": "rdf:Property", - "rdfs:comment": "holds between a biological entity and a phenotype, where a phenotype is construed broadly as any kind of quality of an organism part, a collection of these qualities, or a change in quality or qualities (e.g. abnormally increased temperature). 
", - "rdfs:label": "hasPhenotype", - "schema:domainIncludes": { - "@id": "bts:BiologicalEntity" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:DiseaseOrPhenotypicFeature" - } - }, - { - "@id": "bts:precedes", - "@type": "rdf:Property", - "rdfs:comment": "holds between two processes, where one completes before the other begins", - "rdfs:label": "precedes", - "schema:domainIncludes": { - "@id": "bts:Occurrent" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:Occurrent" - } - }, - { - "@id": "bts:subclassOf", - "@type": "rdf:Property", - "rdfs:comment": "holds between two classes where the domain class is a specialization of the range class", - "rdfs:label": "subclassOf", - "schema:domainIncludes": { - "@id": "bts:OntologyClass" - }, - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": { - "@id": "bts:OntologyClass" - } - }, { "@id": "bts:Component", "@type": "rdfs:Class", @@ -1979,7 +14,7 @@ "rdfs:label": "Component", "rdfs:subClassOf": [ { - "@id": "schema:Thing" + "@id": "bts:Thing" } ], "schema:isPartOf": { @@ -2024,90 +59,195 @@ "sms:validationRules": [] }, { - "@id": "bts:PatientID", + "@id": "bts:PatientID", + "@type": "rdfs:Class", + "rdfs:comment": "TBD", + "rdfs:label": "PatientID", + "rdfs:subClassOf": [ + { + "@id": "bts:DataProperty" + } + ], + "schema:isPartOf": { + "@id": "http://schema.biothings.io" + }, + "sms:displayName": "Patient ID", + "sms:required": "sms:true", + "sms:validationRules": [] + }, + { + "@id": "bts:Sex", + "@type": "rdfs:Class", + "rdfs:comment": "TBD", + "rdfs:label": "Sex", + "rdfs:subClassOf": [ + { + "@id": "bts:DataProperty" + } + ], + "schema:isPartOf": { + "@id": "http://schema.biothings.io" + }, + "schema:rangeIncludes": [ + { + "@id": "bts:Female" + }, + { + "@id": "bts:Male" + }, + { + "@id": "bts:Other" + } + ], + "sms:displayName": "Sex", + "sms:required": "sms:true", + "sms:validationRules": [] + }, + { + "@id": "bts:YearofBirth", + "@type": "rdfs:Class", + "rdfs:comment": "TBD", + "rdfs:label": "YearofBirth", + "rdfs:subClassOf": [ + { + "@id": "bts:DataProperty" + } + ], + "schema:isPartOf": { + "@id": "http://schema.biothings.io" + }, + "sms:displayName": "Year of Birth", + "sms:required": "sms:false", + "sms:validationRules": [] + }, + { + "@id": "bts:Diagnosis", + "@type": "rdfs:Class", + "rdfs:comment": "TBD", + "rdfs:label": "Diagnosis", + "rdfs:subClassOf": [ + { + "@id": "bts:DataProperty" + } + ], + "schema:isPartOf": { + "@id": "http://schema.biothings.io" + }, + "schema:rangeIncludes": [ + { + "@id": "bts:Healthy" + }, + { + "@id": "bts:Cancer" + } + ], + "sms:displayName": "Diagnosis", + "sms:required": "sms:true", + "sms:validationRules": [] + }, + { + "@id": "bts:DataType", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "PatientID", + "rdfs:label": "DataType", "rdfs:subClassOf": [ { - "@id": "bts:DataProperty" + "@id": "bts:Thing" } ], "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - "sms:displayName": "Patient ID", - "sms:required": "sms:true", + "sms:displayName": "DataType", + "sms:required": "sms:false", "sms:validationRules": [] }, { - "@id": "bts:Sex", + "@id": "bts:DataProperty", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "Sex", + "rdfs:label": "DataProperty", "rdfs:subClassOf": [ { - "@id": "bts:DataProperty" + "@id": "bts:Thing" } ], "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - 
"schema:rangeIncludes": [ - { - "@id": "bts:Female" - }, - { - "@id": "bts:Male" - }, + "sms:displayName": "DataProperty", + "sms:required": "sms:false", + "sms:validationRules": [] + }, + { + "@id": "bts:Female", + "@type": "rdfs:Class", + "rdfs:comment": "TBD", + "rdfs:label": "Female", + "rdfs:subClassOf": [ { - "@id": "bts:Other" + "@id": "bts:Sex" } ], - "sms:displayName": "Sex", - "sms:required": "sms:true", + "schema:isPartOf": { + "@id": "http://schema.biothings.io" + }, + "sms:displayName": "Female", + "sms:required": "sms:false", "sms:validationRules": [] }, { - "@id": "bts:YearofBirth", + "@id": "bts:Male", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "YearofBirth", + "rdfs:label": "Male", "rdfs:subClassOf": [ { - "@id": "bts:DataProperty" + "@id": "bts:Sex" } ], "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - "sms:displayName": "Year of Birth", + "sms:displayName": "Male", "sms:required": "sms:false", "sms:validationRules": [] }, { - "@id": "bts:Diagnosis", + "@id": "bts:Other", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "Diagnosis", + "rdfs:label": "Other", "rdfs:subClassOf": [ { - "@id": "bts:DataProperty" + "@id": "bts:Sex" } ], "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - "schema:rangeIncludes": [ + "sms:displayName": "Other", + "sms:required": "sms:false", + "sms:validationRules": [] + }, + { + "@id": "bts:Healthy", + "@type": "rdfs:Class", + "rdfs:comment": "TBD", + "rdfs:label": "Healthy", + "rdfs:subClassOf": [ { - "@id": "bts:Healthy" + "@id": "bts:Diagnosis" }, { - "@id": "bts:Cancer" + "@id": "bts:TissueStatus" } ], - "sms:displayName": "Diagnosis", - "sms:required": "sms:true", + "schema:isPartOf": { + "@id": "http://schema.biothings.io" + }, + "sms:displayName": "Healthy", + "sms:required": "sms:false", "sms:validationRules": [] }, { @@ -2118,6 +258,9 @@ "rdfs:subClassOf": [ { "@id": "bts:ValidValue" + }, + { + "@id": "bts:Diagnosis" } ], "schema:isPartOf": { @@ -2206,241 +349,180 @@ ] }, { - "@id": "bts:Biospecimen", + "@id": "bts:ValidValue", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "Biospecimen", + "rdfs:label": "ValidValue", "rdfs:subClassOf": [ { - "@id": "bts:DataType" + "@id": "bts:Thing" } ], "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - "sms:displayName": "Biospecimen", + "sms:displayName": "ValidValue", "sms:required": "sms:false", - "sms:requiresComponent": [ - { - "@id": "bts:Patient" - } - ], - "sms:requiresDependency": [ - { - "@id": "bts:SampleID" - }, - { - "@id": "bts:PatientID" - }, - { - "@id": "bts:TissueStatus" - }, - { - "@id": "bts:Component" - } - ], "sms:validationRules": [] }, { - "@id": "bts:SampleID", + "@id": "bts:Breast", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "SampleID", + "rdfs:label": "Breast", "rdfs:subClassOf": [ { - "@id": "bts:DataProperty" + "@id": "bts:CancerType" + }, + { + "@id": "bts:FamilyHistory" } ], "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - "sms:displayName": "Sample ID", - "sms:required": "sms:true", + "sms:displayName": "Breast", + "sms:required": "sms:false", "sms:validationRules": [] }, { - "@id": "bts:TissueStatus", + "@id": "bts:Colorectal", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "TissueStatus", + "rdfs:label": "Colorectal", "rdfs:subClassOf": [ { - "@id": "bts:DataProperty" - } - ], - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "schema:rangeIncludes": [ - { - "@id": "bts:Healthy" + "@id": "bts:CancerType" }, { - "@id": 
"bts:Malignant" - } - ], - "sms:displayName": "Tissue Status", - "sms:required": "sms:true", - "sms:validationRules": [] - }, - { - "@id": "bts:BulkRNA-seqAssay", - "@type": "rdfs:Class", - "rdfs:comment": "TBD", - "rdfs:label": "BulkRNA-seqAssay", - "rdfs:subClassOf": [ - { - "@id": "bts:DataType" + "@id": "bts:FamilyHistory" } ], "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - "sms:displayName": "Bulk RNA-seq Assay", + "sms:displayName": "Colorectal", "sms:required": "sms:false", - "sms:requiresComponent": [ - { - "@id": "bts:Biospecimen" - } - ], - "sms:requiresDependency": [ - { - "@id": "bts:Filename" - }, - { - "@id": "bts:SampleID" - }, - { - "@id": "bts:FileFormat" - }, - { - "@id": "bts:Component" - } - ], "sms:validationRules": [] }, { - "@id": "bts:Filename", + "@id": "bts:Lung", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "Filename", + "rdfs:label": "Lung", "rdfs:subClassOf": [ { - "@id": "bts:DataProperty" + "@id": "bts:CancerType" + }, + { + "@id": "bts:FamilyHistory" } ], "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - "sms:displayName": "Filename", - "sms:required": "sms:true", + "sms:displayName": "Lung", + "sms:required": "sms:false", "sms:validationRules": [] }, { - "@id": "bts:FileFormat", + "@id": "bts:Prostate", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "FileFormat", + "rdfs:label": "Prostate", "rdfs:subClassOf": [ { - "@id": "bts:DataProperty" + "@id": "bts:CancerType" + }, + { + "@id": "bts:FamilyHistory" } ], "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - "schema:rangeIncludes": [ - { - "@id": "bts:FASTQ" - }, - { - "@id": "bts:BAM" - }, - { - "@id": "bts:CRAM" - }, - { - "@id": "bts:CSV/TSV" - } - ], - "sms:displayName": "File Format", - "sms:required": "sms:true", + "sms:displayName": "Prostate", + "sms:required": "sms:false", "sms:validationRules": [] }, { - "@id": "bts:BAM", + "@id": "bts:Skin", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "BAM", + "rdfs:label": "Skin", "rdfs:subClassOf": [ { - "@id": "bts:ValidValue" + "@id": "bts:CancerType" + }, + { + "@id": "bts:FamilyHistory" } ], "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - "sms:displayName": "BAM", + "sms:displayName": "Skin", "sms:required": "sms:false", - "sms:requiresDependency": [ - { - "@id": "bts:GenomeBuild" - } - ], "sms:validationRules": [] }, { - "@id": "bts:CRAM", + "@id": "bts:Biospecimen", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "CRAM", + "rdfs:label": "Biospecimen", "rdfs:subClassOf": [ { - "@id": "bts:ValidValue" + "@id": "bts:DataType" } ], "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - "sms:displayName": "CRAM", + "sms:displayName": "Biospecimen", "sms:required": "sms:false", + "sms:requiresComponent": [ + { + "@id": "bts:Patient" + } + ], "sms:requiresDependency": [ { - "@id": "bts:GenomeBuild" + "@id": "bts:SampleID" }, { - "@id": "bts:GenomeFASTA" + "@id": "bts:PatientID" + }, + { + "@id": "bts:TissueStatus" + }, + { + "@id": "bts:Component" } ], "sms:validationRules": [] }, { - "@id": "bts:CSV/TSV", + "@id": "bts:SampleID", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "CSV/TSV", - "rdfs:subClassOf": [ - { - "@id": "bts:ValidValue" - } - ], - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "sms:displayName": "CSV/TSV", - "sms:required": "sms:false", - "sms:requiresDependency": [ + "rdfs:label": "SampleID", + "rdfs:subClassOf": [ { - "@id": "bts:GenomeBuild" + "@id": "bts:DataProperty" } ], + 
"schema:isPartOf": { + "@id": "http://schema.biothings.io" + }, + "sms:displayName": "Sample ID", + "sms:required": "sms:true", "sms:validationRules": [] }, { - "@id": "bts:GenomeBuild", + "@id": "bts:TissueStatus", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "GenomeBuild", + "rdfs:label": "TissueStatus", "rdfs:subClassOf": [ { "@id": "bts:DataProperty" @@ -2451,44 +533,38 @@ }, "schema:rangeIncludes": [ { - "@id": "bts:GRCh37" - }, - { - "@id": "bts:GRCh38" - }, - { - "@id": "bts:GRCm38" + "@id": "bts:Healthy" }, { - "@id": "bts:GRCm39" + "@id": "bts:Malignant" } ], - "sms:displayName": "Genome Build", + "sms:displayName": "Tissue Status", "sms:required": "sms:true", "sms:validationRules": [] }, { - "@id": "bts:GenomeFASTA", + "@id": "bts:Malignant", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "GenomeFASTA", + "rdfs:label": "Malignant", "rdfs:subClassOf": [ { - "@id": "bts:DataProperty" + "@id": "bts:TissueStatus" } ], "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - "sms:displayName": "Genome FASTA", - "sms:required": "sms:true", + "sms:displayName": "Malignant", + "sms:required": "sms:false", "sms:validationRules": [] }, { - "@id": "bts:MockComponent", + "@id": "bts:BulkRNA-seqAssay", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "MockComponent", + "rdfs:label": "BulkRNA-seqAssay", "rdfs:subClassOf": [ { "@id": "bts:DataType" @@ -2497,80 +573,34 @@ "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - "sms:displayName": "MockComponent", + "sms:displayName": "Bulk RNA-seq Assay", "sms:required": "sms:false", - "sms:requiresDependency": [ - { - "@id": "bts:Component" - }, - { - "@id": "bts:CheckList" - }, - { - "@id": "bts:CheckRegexList" - }, - { - "@id": "bts:CheckRegexSingle" - }, - { - "@id": "bts:CheckRegexFormat" - }, - { - "@id": "bts:CheckRegexInteger" - }, - { - "@id": "bts:CheckNum" - }, - { - "@id": "bts:CheckFloat" - }, - { - "@id": "bts:CheckInt" - }, - { - "@id": "bts:CheckString" - }, - { - "@id": "bts:CheckURL" - }, - { - "@id": "bts:CheckMatchatLeast" - }, - { - "@id": "bts:CheckMatchatLeastvalues" - }, - { - "@id": "bts:CheckMatchExactly" - }, - { - "@id": "bts:CheckMatchExactlyvalues" - }, - { - "@id": "bts:CheckRecommended" - }, + "sms:requiresComponent": [ { - "@id": "bts:CheckAges" - }, + "@id": "bts:Biospecimen" + } + ], + "sms:requiresDependency": [ { - "@id": "bts:CheckUnique" + "@id": "bts:Filename" }, { - "@id": "bts:CheckRange" + "@id": "bts:SampleID" }, { - "@id": "bts:CheckDate" + "@id": "bts:FileFormat" }, { - "@id": "bts:CheckNA" + "@id": "bts:Component" } ], "sms:validationRules": [] }, { - "@id": "bts:CheckList", + "@id": "bts:Filename", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "CheckList", + "rdfs:label": "Filename", "rdfs:subClassOf": [ { "@id": "bts:DataProperty" @@ -2579,31 +609,15 @@ "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - "schema:rangeIncludes": [ - { - "@id": "bts:Ab" - }, - { - "@id": "bts:Cd" - }, - { - "@id": "bts:Ef" - }, - { - "@id": "bts:Gh" - } - ], - "sms:displayName": "Check List", + "sms:displayName": "Filename", "sms:required": "sms:true", - "sms:validationRules": [ - "list strict" - ] + "sms:validationRules": [] }, { - "@id": "bts:CheckRegexList", + "@id": "bts:FileFormat", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "CheckRegexList", + "rdfs:label": "FileFormat", "rdfs:subClassOf": [ { "@id": "bts:DataProperty" @@ -2612,132 +626,124 @@ "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - 
"sms:displayName": "Check Regex List", - "sms:required": "sms:true", - "sms:validationRules": [ - "list strict", - "regex match [a-f]" - ] - }, - { - "@id": "bts:CheckRegexSingle", - "@type": "rdfs:Class", - "rdfs:comment": "TBD", - "rdfs:label": "CheckRegexSingle", - "rdfs:subClassOf": [ + "schema:rangeIncludes": [ { - "@id": "bts:DataProperty" + "@id": "bts:FASTQ" + }, + { + "@id": "bts:BAM" + }, + { + "@id": "bts:CRAM" + }, + { + "@id": "bts:CSV/TSV" } ], - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "sms:displayName": "Check Regex Single", + "sms:displayName": "File Format", "sms:required": "sms:true", - "sms:validationRules": [ - "regex search [a-f]" - ] + "sms:validationRules": [] }, { - "@id": "bts:CheckRegexFormat", + "@id": "bts:FASTQ", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "CheckRegexFormat", + "rdfs:label": "FASTQ", "rdfs:subClassOf": [ { - "@id": "bts:DataProperty" + "@id": "bts:FileFormat" } ], "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - "sms:displayName": "Check Regex Format", - "sms:required": "sms:true", - "sms:validationRules": [ - "regex match [a-f]" - ] + "sms:displayName": "FASTQ", + "sms:required": "sms:false", + "sms:validationRules": [] }, { - "@id": "bts:CheckRegexInteger", + "@id": "bts:BAM", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "CheckRegexInteger", + "rdfs:label": "BAM", "rdfs:subClassOf": [ { - "@id": "bts:DataProperty" + "@id": "bts:ValidValue" + }, + { + "@id": "bts:FileFormat" } ], "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - "sms:displayName": "Check Regex Integer", - "sms:required": "sms:true", - "sms:validationRules": [ - "regex search ^\\d+$" - ] - }, - { - "@id": "bts:CheckNum", - "@type": "rdfs:Class", - "rdfs:comment": "TBD", - "rdfs:label": "CheckNum", - "rdfs:subClassOf": [ + "sms:displayName": "BAM", + "sms:required": "sms:false", + "sms:requiresDependency": [ { - "@id": "bts:DataProperty" + "@id": "bts:GenomeBuild" } ], - "schema:isPartOf": { - "@id": "http://schema.biothings.io" - }, - "sms:displayName": "Check Num", - "sms:required": "sms:true", - "sms:validationRules": [ - "num" - ] + "sms:validationRules": [] }, { - "@id": "bts:CheckFloat", + "@id": "bts:CRAM", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "CheckFloat", + "rdfs:label": "CRAM", "rdfs:subClassOf": [ { - "@id": "bts:DataProperty" + "@id": "bts:ValidValue" + }, + { + "@id": "bts:FileFormat" } ], "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - "sms:displayName": "Check Float", - "sms:required": "sms:true", - "sms:validationRules": [ - "float" - ] + "sms:displayName": "CRAM", + "sms:required": "sms:false", + "sms:requiresDependency": [ + { + "@id": "bts:GenomeBuild" + }, + { + "@id": "bts:GenomeFASTA" + } + ], + "sms:validationRules": [] }, { - "@id": "bts:CheckInt", + "@id": "bts:CSV/TSV", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "CheckInt", + "rdfs:label": "CSV/TSV", "rdfs:subClassOf": [ { - "@id": "bts:DataProperty" + "@id": "bts:ValidValue" + }, + { + "@id": "bts:FileFormat" } ], "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - "sms:displayName": "Check Int", - "sms:required": "sms:true", - "sms:validationRules": [ - "int" - ] + "sms:displayName": "CSV/TSV", + "sms:required": "sms:false", + "sms:requiresDependency": [ + { + "@id": "bts:GenomeBuild" + } + ], + "sms:validationRules": [] }, { - "@id": "bts:CheckString", + "@id": "bts:GenomeBuild", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": 
"CheckString", + "rdfs:label": "GenomeBuild", "rdfs:subClassOf": [ { "@id": "bts:DataProperty" @@ -2746,17 +752,29 @@ "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - "sms:displayName": "Check String", + "schema:rangeIncludes": [ + { + "@id": "bts:GRCh37" + }, + { + "@id": "bts:GRCh38" + }, + { + "@id": "bts:GRCm38" + }, + { + "@id": "bts:GRCm39" + } + ], + "sms:displayName": "Genome Build", "sms:required": "sms:true", - "sms:validationRules": [ - "str" - ] + "sms:validationRules": [] }, { - "@id": "bts:CheckURL", + "@id": "bts:GenomeFASTA", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "CheckURL", + "rdfs:label": "GenomeFASTA", "rdfs:subClassOf": [ { "@id": "bts:DataProperty" @@ -2765,112 +783,165 @@ "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - "sms:displayName": "Check URL", + "sms:displayName": "Genome FASTA", "sms:required": "sms:true", - "sms:validationRules": [ - "url" - ] + "sms:validationRules": [] }, { - "@id": "bts:CheckMatchatLeast", + "@id": "bts:GRCh37", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "CheckMatchatLeast", + "rdfs:label": "GRCh37", "rdfs:subClassOf": [ { - "@id": "bts:DataProperty" + "@id": "bts:GenomeBuild" } ], "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - "sms:displayName": "Check Match at Least", - "sms:required": "sms:true", - "sms:validationRules": [ - "matchAtLeastOne Patient.PatientID set" - ] + "sms:displayName": "GRCh37", + "sms:required": "sms:false", + "sms:validationRules": [] }, { - "@id": "bts:CheckMatchExactly", + "@id": "bts:GRCh38", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "CheckMatchExactly", + "rdfs:label": "GRCh38", "rdfs:subClassOf": [ { - "@id": "bts:DataProperty" + "@id": "bts:GenomeBuild" } ], "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - "sms:displayName": "Check Match Exactly", - "sms:required": "sms:true", - "sms:validationRules": [ - "matchExactlyOne MockComponent.checkMatchExactly set" - ] + "sms:displayName": "GRCh38", + "sms:required": "sms:false", + "sms:validationRules": [] }, { - "@id": "bts:CheckMatchatLeastvalues", + "@id": "bts:GRCm38", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "CheckMatchatLeastvalues", + "rdfs:label": "GRCm38", "rdfs:subClassOf": [ { - "@id": "bts:DataProperty" + "@id": "bts:GenomeBuild" } ], "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - "sms:displayName": "Check Match at Least values", - "sms:required": "sms:true", - "sms:validationRules": [ - "matchAtLeastOne MockComponent.checkMatchatLeastvalues value" - ] + "sms:displayName": "GRCm38", + "sms:required": "sms:false", + "sms:validationRules": [] }, { - "@id": "bts:CheckMatchExactlyvalues", + "@id": "bts:GRCm39", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "CheckMatchExactlyvalues", + "rdfs:label": "GRCm39", "rdfs:subClassOf": [ { - "@id": "bts:DataProperty" + "@id": "bts:GenomeBuild" } ], "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - "sms:displayName": "Check Match Exactly values", - "sms:required": "sms:true", - "sms:validationRules": [ - "matchExactlyOne MockComponent.checkMatchExactlyvalues value" - ] + "sms:displayName": "GRCm39", + "sms:required": "sms:false", + "sms:validationRules": [] }, { - "@id": "bts:CheckRecommended", + "@id": "bts:MockComponent", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "CheckRecommended", + "rdfs:label": "MockComponent", "rdfs:subClassOf": [ { - "@id": "bts:DataProperty" + "@id": "bts:DataType" } ], "schema:isPartOf": { 
"@id": "http://schema.biothings.io" }, - "sms:displayName": "Check Recommended", + "sms:displayName": "MockComponent", "sms:required": "sms:false", - "sms:validationRules": [ - "recommended" - ] + "sms:requiresDependency": [ + { + "@id": "bts:Component" + }, + { + "@id": "bts:CheckList" + }, + { + "@id": "bts:CheckRegexList" + }, + { + "@id": "bts:CheckRegexSingle" + }, + { + "@id": "bts:CheckRegexFormat" + }, + { + "@id": "bts:CheckRegexInteger" + }, + { + "@id": "bts:CheckNum" + }, + { + "@id": "bts:CheckFloat" + }, + { + "@id": "bts:CheckInt" + }, + { + "@id": "bts:CheckString" + }, + { + "@id": "bts:CheckURL" + }, + { + "@id": "bts:CheckMatchatLeast" + }, + { + "@id": "bts:CheckMatchatLeastvalues" + }, + { + "@id": "bts:CheckMatchExactly" + }, + { + "@id": "bts:CheckMatchExactlyvalues" + }, + { + "@id": "bts:CheckRecommended" + }, + { + "@id": "bts:CheckAges" + }, + { + "@id": "bts:CheckUnique" + }, + { + "@id": "bts:CheckRange" + }, + { + "@id": "bts:CheckDate" + }, + { + "@id": "bts:CheckNA" + } + ], + "sms:validationRules": [] }, { - "@id": "bts:CheckAges", + "@id": "bts:CheckList", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "CheckAges", + "rdfs:label": "CheckList", "rdfs:subClassOf": [ { "@id": "bts:DataProperty" @@ -2879,17 +950,31 @@ "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - "sms:displayName": "Check Ages", + "schema:rangeIncludes": [ + { + "@id": "bts:Ab" + }, + { + "@id": "bts:Cd" + }, + { + "@id": "bts:Ef" + }, + { + "@id": "bts:Gh" + } + ], + "sms:displayName": "Check List", "sms:required": "sms:true", "sms:validationRules": [ - "protectAges" + "list strict" ] }, { - "@id": "bts:CheckUnique", + "@id": "bts:CheckRegexList", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "CheckUnique", + "rdfs:label": "CheckRegexList", "rdfs:subClassOf": [ { "@id": "bts:DataProperty" @@ -2898,17 +983,18 @@ "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - "sms:displayName": "Check Unique", + "sms:displayName": "Check Regex List", "sms:required": "sms:true", "sms:validationRules": [ - "unique error" + "list strict", + "regex match [a-f]" ] }, { - "@id": "bts:CheckRange", + "@id": "bts:CheckRegexSingle", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "CheckRange", + "rdfs:label": "CheckRegexSingle", "rdfs:subClassOf": [ { "@id": "bts:DataProperty" @@ -2917,17 +1003,17 @@ "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - "sms:displayName": "Check Range", + "sms:displayName": "Check Regex Single", "sms:required": "sms:true", "sms:validationRules": [ - "inRange 50 100 error" + "regex search [a-f]" ] }, { - "@id": "bts:CheckDate", + "@id": "bts:CheckRegexFormat", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "CheckDate", + "rdfs:label": "CheckRegexFormat", "rdfs:subClassOf": [ { "@id": "bts:DataProperty" @@ -2936,17 +1022,17 @@ "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - "sms:displayName": "Check Date", + "sms:displayName": "Check Regex Format", "sms:required": "sms:true", "sms:validationRules": [ - "date" + "regex match [a-f]" ] }, { - "@id": "bts:CheckNA", + "@id": "bts:CheckRegexInteger", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "CheckNA", + "rdfs:label": "CheckRegexInteger", "rdfs:subClassOf": [ { "@id": "bts:DataProperty" @@ -2955,46 +1041,36 @@ "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - "sms:displayName": "Check NA", + "sms:displayName": "Check Regex Integer", "sms:required": "sms:true", "sms:validationRules": [ - "int", - "IsNA" 
+ "regex search ^\\d+$" ] }, { - "@id": "bts:MockRDB", + "@id": "bts:CheckNum", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "MockRDB", + "rdfs:label": "CheckNum", "rdfs:subClassOf": [ { - "@id": "bts:DataType" + "@id": "bts:DataProperty" } ], "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - "sms:displayName": "MockRDB", - "sms:required": "sms:false", - "sms:requiresDependency": [ - { - "@id": "bts:Component" - }, - { - "@id": "bts:MockRDBId" - }, - { - "@id": "bts:SourceManifest" - } - ], - "sms:validationRules": [] + "sms:displayName": "Check Num", + "sms:required": "sms:true", + "sms:validationRules": [ + "num" + ] }, { - "@id": "bts:MockRDBId", + "@id": "bts:CheckFloat", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "MockRDBId", + "rdfs:label": "CheckFloat", "rdfs:subClassOf": [ { "@id": "bts:DataProperty" @@ -3003,17 +1079,17 @@ "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - "sms:displayName": "MockRDB_id", + "sms:displayName": "Check Float", "sms:required": "sms:true", "sms:validationRules": [ - "int" + "float" ] }, { - "@id": "bts:SourceManifest", + "@id": "bts:CheckInt", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "SourceManifest", + "rdfs:label": "CheckInt", "rdfs:subClassOf": [ { "@id": "bts:DataProperty" @@ -3022,270 +1098,297 @@ "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - "sms:displayName": "SourceManifest", + "sms:displayName": "Check Int", "sms:required": "sms:true", - "sms:validationRules": [] + "sms:validationRules": [ + "int" + ] }, { - "@id": "bts:Female", + "@id": "bts:CheckString", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "Female", + "rdfs:label": "CheckString", "rdfs:subClassOf": [ { - "@id": "bts:Sex" + "@id": "bts:DataProperty" } ], "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - "sms:displayName": "Female", - "sms:required": "sms:false", - "sms:validationRules": [] + "sms:displayName": "Check String", + "sms:required": "sms:true", + "sms:validationRules": [ + "str" + ] }, { - "@id": "bts:Male", + "@id": "bts:CheckURL", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "Male", + "rdfs:label": "CheckURL", "rdfs:subClassOf": [ { - "@id": "bts:Sex" + "@id": "bts:DataProperty" } ], "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - "sms:displayName": "Male", - "sms:required": "sms:false", - "sms:validationRules": [] + "sms:displayName": "Check URL", + "sms:required": "sms:true", + "sms:validationRules": [ + "url" + ] }, { - "@id": "bts:Other", + "@id": "bts:CheckMatchatLeast", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "Other", + "rdfs:label": "CheckMatchatLeast", "rdfs:subClassOf": [ { - "@id": "bts:Sex" + "@id": "bts:DataProperty" } ], "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - "sms:displayName": "Other", - "sms:required": "sms:false", - "sms:validationRules": [] + "sms:displayName": "Check Match at Least", + "sms:required": "sms:true", + "sms:validationRules": [ + "matchAtLeastOne Patient.PatientID set" + ] }, { - "@id": "bts:Healthy", + "@id": "bts:CheckMatchatLeastvalues", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "Healthy", + "rdfs:label": "CheckMatchatLeastvalues", "rdfs:subClassOf": [ { - "@id": "bts:Diagnosis" + "@id": "bts:DataProperty" } ], "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - "sms:displayName": "Healthy", - "sms:required": "sms:false", - "sms:validationRules": [] + "sms:displayName": "Check Match at Least values", + 
"sms:required": "sms:true", + "sms:validationRules": [ + "matchAtLeastOne MockComponent.checkMatchatLeastvalues value" + ] }, { - "@id": "bts:Breast", + "@id": "bts:CheckMatchExactly", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "Breast", + "rdfs:label": "CheckMatchExactly", "rdfs:subClassOf": [ { - "@id": "bts:CancerType" + "@id": "bts:DataProperty" } ], "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - "sms:displayName": "Breast", - "sms:required": "sms:false", - "sms:validationRules": [] + "sms:displayName": "Check Match Exactly", + "sms:required": "sms:true", + "sms:validationRules": [ + "matchExactlyOne MockComponent.checkMatchExactly set" + ] }, { - "@id": "bts:Colorectal", + "@id": "bts:CheckMatchExactlyvalues", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "Colorectal", + "rdfs:label": "CheckMatchExactlyvalues", "rdfs:subClassOf": [ { - "@id": "bts:CancerType" + "@id": "bts:DataProperty" } ], "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - "sms:displayName": "Colorectal", - "sms:required": "sms:false", - "sms:validationRules": [] + "sms:displayName": "Check Match Exactly values", + "sms:required": "sms:true", + "sms:validationRules": [ + "matchExactlyOne MockComponent.checkMatchExactlyvalues value" + ] }, { - "@id": "bts:Lung", + "@id": "bts:CheckRecommended", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "Lung", + "rdfs:label": "CheckRecommended", "rdfs:subClassOf": [ { - "@id": "bts:CancerType" + "@id": "bts:DataProperty" } ], "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - "sms:displayName": "Lung", + "sms:displayName": "Check Recommended", "sms:required": "sms:false", - "sms:validationRules": [] + "sms:validationRules": [ + "recommended" + ] }, { - "@id": "bts:Prostate", + "@id": "bts:CheckAges", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "Prostate", + "rdfs:label": "CheckAges", "rdfs:subClassOf": [ { - "@id": "bts:CancerType" + "@id": "bts:DataProperty" } ], "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - "sms:displayName": "Prostate", - "sms:required": "sms:false", - "sms:validationRules": [] + "sms:displayName": "Check Ages", + "sms:required": "sms:true", + "sms:validationRules": [ + "protectAges" + ] }, { - "@id": "bts:Skin", + "@id": "bts:CheckUnique", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "Skin", + "rdfs:label": "CheckUnique", "rdfs:subClassOf": [ { - "@id": "bts:CancerType" + "@id": "bts:DataProperty" } ], "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - "sms:displayName": "Skin", - "sms:required": "sms:false", - "sms:validationRules": [] + "sms:displayName": "Check Unique", + "sms:required": "sms:true", + "sms:validationRules": [ + "unique error" + ] }, { - "@id": "bts:Malignant", + "@id": "bts:CheckRange", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "Malignant", + "rdfs:label": "CheckRange", "rdfs:subClassOf": [ { - "@id": "bts:TissueStatus" + "@id": "bts:DataProperty" } ], "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - "sms:displayName": "Malignant", - "sms:required": "sms:false", - "sms:validationRules": [] + "sms:displayName": "Check Range", + "sms:required": "sms:true", + "sms:validationRules": [ + "inRange 50 100 error" + ] }, { - "@id": "bts:FASTQ", + "@id": "bts:CheckDate", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "FASTQ", + "rdfs:label": "CheckDate", "rdfs:subClassOf": [ { - "@id": "bts:FileFormat" + "@id": "bts:DataProperty" } ], "schema:isPartOf": { 
"@id": "http://schema.biothings.io" }, - "sms:displayName": "FASTQ", - "sms:required": "sms:false", - "sms:validationRules": [] + "sms:displayName": "Check Date", + "sms:required": "sms:true", + "sms:validationRules": [ + "date" + ] }, { - "@id": "bts:GRCh37", + "@id": "bts:CheckNA", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "GRCh37", + "rdfs:label": "CheckNA", "rdfs:subClassOf": [ { - "@id": "bts:GenomeBuild" + "@id": "bts:DataProperty" } ], "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - "sms:displayName": "GRCh37", - "sms:required": "sms:false", - "sms:validationRules": [] + "sms:displayName": "Check NA", + "sms:required": "sms:true", + "sms:validationRules": [ + "int", + "IsNA" + ] }, { - "@id": "bts:GRCh38", + "@id": "bts:Ab", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "GRCh38", + "rdfs:label": "Ab", "rdfs:subClassOf": [ { - "@id": "bts:GenomeBuild" + "@id": "bts:CheckList" } ], "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - "sms:displayName": "GRCh38", + "sms:displayName": "ab", "sms:required": "sms:false", "sms:validationRules": [] }, { - "@id": "bts:GRCm38", + "@id": "bts:Cd", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "GRCm38", + "rdfs:label": "Cd", "rdfs:subClassOf": [ { - "@id": "bts:GenomeBuild" + "@id": "bts:CheckList" } ], "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - "sms:displayName": "GRCm38", + "sms:displayName": "cd", "sms:required": "sms:false", "sms:validationRules": [] }, { - "@id": "bts:GRCm39", + "@id": "bts:Ef", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "GRCm39", + "rdfs:label": "Ef", "rdfs:subClassOf": [ { - "@id": "bts:GenomeBuild" + "@id": "bts:CheckList" } ], "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - "sms:displayName": "GRCm39", + "sms:displayName": "ef", "sms:required": "sms:false", "sms:validationRules": [] }, { - "@id": "bts:Ab", + "@id": "bts:Gh", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "Ab", + "rdfs:label": "Gh", "rdfs:subClassOf": [ { "@id": "bts:CheckList" @@ -3294,59 +1397,72 @@ "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - "sms:displayName": "ab", + "sms:displayName": "gh", "sms:required": "sms:false", "sms:validationRules": [] }, { - "@id": "bts:Cd", + "@id": "bts:MockRDB", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "Cd", + "rdfs:label": "MockRDB", "rdfs:subClassOf": [ { - "@id": "bts:CheckList" + "@id": "bts:DataType" } ], "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - "sms:displayName": "cd", + "sms:displayName": "MockRDB", "sms:required": "sms:false", + "sms:requiresDependency": [ + { + "@id": "bts:Component" + }, + { + "@id": "bts:MockRDBId" + }, + { + "@id": "bts:SourceManifest" + } + ], "sms:validationRules": [] }, { - "@id": "bts:Ef", + "@id": "bts:MockRDBId", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "Ef", + "rdfs:label": "MockRDBId", "rdfs:subClassOf": [ { - "@id": "bts:CheckList" + "@id": "bts:DataProperty" } ], "schema:isPartOf": { "@id": "http://schema.biothings.io" }, - "sms:displayName": "ef", - "sms:required": "sms:false", - "sms:validationRules": [] + "sms:displayName": "MockRDB_id", + "sms:required": "sms:true", + "sms:validationRules": [ + "int" + ] }, { - "@id": "bts:Gh", + "@id": "bts:SourceManifest", "@type": "rdfs:Class", "rdfs:comment": "TBD", - "rdfs:label": "Gh", + "rdfs:label": "SourceManifest", "rdfs:subClassOf": [ { - "@id": "bts:CheckList" + "@id": "bts:DataProperty" } ], "schema:isPartOf": { 
"@id": "http://schema.biothings.io" }, - "sms:displayName": "gh", - "sms:required": "sms:false", + "sms:displayName": "SourceManifest", + "sms:required": "sms:true", "sms:validationRules": [] } ], diff --git a/tests/data/properties.test.model.csv b/tests/data/properties.test.model.csv new file mode 100644 index 000000000..1f2121356 --- /dev/null +++ b/tests/data/properties.test.model.csv @@ -0,0 +1,6 @@ +Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules +cohorts,,,"Component, dataset_id, cohort_tag_id, id","name, dataset_id, cohort_tag_id, id",FALSE,,,, +cohort_tag_id,,,,,FALSE,,,,matchAtLeastOne tags.id set error +name,,,,,FALSE,,,, +dataset_id,,,,,FALSE,,,, +id,,,,,FALSE,,,, diff --git a/tests/data/validator_dag_test.model.csv b/tests/data/validator_dag_test.model.csv new file mode 100644 index 000000000..3184250ee --- /dev/null +++ b/tests/data/validator_dag_test.model.csv @@ -0,0 +1,44 @@ +Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules +Patient,,,"Patient ID, Sex, Year of Birth, Diagnosis, Component",,FALSE,DataType,,, +Patient ID,,,Patient,,TRUE,DataProperty,,, +Sex,,"Female, Male, Other",,,TRUE,DataProperty,,, +Year of Birth,,,,,FALSE,DataProperty,,, +Diagnosis,,"Healthy, Cancer, Diagnosis",,,TRUE,DataProperty,,, +Cancer,,,"Cancer Type, Family History",,FALSE,ValidValue,,, +Cancer Type,,"Breast, Colorectal, Lung, Prostate, Skin",,,TRUE,DataProperty,,, +Family History,,"Breast, Colorectal, Lung, Prostate, Skin",Cancer Type,,TRUE,DataProperty,,,list strict +Biospecimen,,,"Sample ID, Patient ID, Tissue Status, Component",,FALSE,DataType,Patient,, +Sample ID,,,,,TRUE,DataProperty,,, +Tissue Status,,"Healthy, Malignant",,,TRUE,DataProperty,,, +Bulk RNA-seq Assay,,,"Filename, Sample ID, File Format, Component",,FALSE,DataType,Biospecimen,, +Filename,,,,,TRUE,DataProperty,,, +File Format,,"FASTQ, BAM, CRAM, CSV/TSV",,,TRUE,DataProperty,,, +BAM,,,Genome Build,,FALSE,ValidValue,,, +CRAM,,,"Genome Build, Genome FASTA",,FALSE,ValidValue,,, +CSV/TSV,,,Genome Build,,FALSE,ValidValue,,, +Genome Build,,"GRCh37, GRCh38, GRCm38, GRCm39",,,TRUE,DataProperty,,, +Genome FASTA,,,,,TRUE,DataProperty,,, +MockComponent,,,"Component, Check List, Check Regex List, Check Regex Single, Check Regex Format, Check Regex Integer, Check Num, Check Float, Check Int, Check String, Check URL,Check Match at Least, Check Match at Least values, Check Match Exactly, Check Match Exactly values, Check Recommended, Check Ages, Check Unique, Check Range, Check Date, Check NA",,FALSE,DataType,,, +Check List,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,list strict +Check Regex List,,,,,TRUE,DataProperty,,,list strict::regex match [a-f] +Check Regex Single,,,,,TRUE,DataProperty,,,regex search [a-f] +Check Regex Format,,,,,TRUE,DataProperty,,,regex match [a-f] +Check Regex Integer,,,,,TRUE,DataProperty,,,regex search ^\d+$ +Check Num,,,,,TRUE,DataProperty,,,num +Check Float,,,,,TRUE,DataProperty,,,float +Check Int,,,,,TRUE,DataProperty,,,int +Check String,,,,,TRUE,DataProperty,,,str +Check URL,,,,,TRUE,DataProperty,,,url +Check Match at Least,,,,,TRUE,DataProperty,,,matchAtLeastOne Patient.PatientID set +Check Match Exactly,,,,,TRUE,DataProperty,,,matchExactlyOne MockComponent.checkMatchExactly set +Check Match at Least values,,,,,TRUE,DataProperty,,,matchAtLeastOne MockComponent.checkMatchatLeastvalues value +Check Match Exactly values,,,,,TRUE,DataProperty,,,matchExactlyOne 
MockComponent.checkMatchExactlyvalues value +Check Recommended,,,,,FALSE,DataProperty,,,recommended +Check Ages,,,,,TRUE,DataProperty,,,protectAges +Check Unique,,,,,TRUE,DataProperty,,,unique error +Check Range,,,,,TRUE,DataProperty,,,inRange 50 100 error +Check Date,,,,,TRUE,DataProperty,,,date +Check NA,,,,,TRUE,DataProperty,,,int::IsNA +MockRDB,,,"Component, MockRDB_id, SourceManifest",,FALSE,DataType,,, +MockRDB_id,,,,,TRUE,DataProperty,,,int +SourceManifest,,,,,TRUE,DataProperty,,, diff --git a/tests/data/validator_test.model.csv b/tests/data/validator_test.model.csv new file mode 100644 index 000000000..b5b84760f --- /dev/null +++ b/tests/data/validator_test.model.csv @@ -0,0 +1,45 @@ +Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules +Patient),,,"Patient ID., Sex-, Year of Birth(, Diagnosis, Component",,FALSE,DataType,,, +Patient ID.,,,,,TRUE,DataProperty,,, +Sex-,,"Female, Male, Other",,,TRUE,DataProperty,,, +Year of Birth(,,,,,FALSE,DataProperty,,, +Diagnosis,,"Healthy, Cancer",,,TRUE,DataProperty,,, +Cancer,,,"Cancer Type, Family History",,FALSE,ValidValue,,, +Cancer Type,,"Breast, Colorectal, Lung, Prostate, Skin",,,TRUE,DataProperty,,, +Family History,,"Breast, Colorectal, Lung, Prostate, Skin",,,TRUE,DataProperty,,,list strict +Biospecimen,,,"Sample ID, Patient ID., Tissue Status, Component",,FALSE,DataType,Patient,, +Sample ID,,,,,TRUE,DataProperty,,, +Tissue Status,,"Healthy, Malignant",,,TRUE,DataProperty,,, +Bulk RNA-seq Assay,,,"Filename, Sample ID, File Format, Component",,FALSE,DataType,Biospecimen,, +Filename,,,,,TRUE,DataProperty,,, +File Format,,"FASTQ, BAM, CRAM, CSV/TSV",,,TRUE,DataProperty,,, +BAM,,,Genome Build,,FALSE,ValidValue,,, +CRAM,,,"Genome Build, Genome FASTA",,FALSE,ValidValue,,, +CSV/TSV,,,Genome Build,,FALSE,ValidValue,,, +Genome Build,,"GRCh37, GRCh38, GRCm38, GRCm39",,,TRUE,DataProperty,,, +Genome FASTA,,,,,TRUE,DataProperty,,, +MockComponent,,,"Component, Check List, Check Regex List, Check Regex Single, Check Regex Format, Check Regex Integer, Check Num, Check Float, Check Int, Check String, Check URL,Check Match at Least, Check Match at Least values, Check Match Exactly, Check Match Exactly values, Check Recommended, Check Ages, Check Unique, Check Range, Check Date, Check NA",,FALSE,DataType,,, +Check List,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,list strict +Check Regex List,,,,,TRUE,DataProperty,,,list strict::regex match [a-f] +Check Regex Single,,,,,TRUE,DataProperty,,,regex search [a-f] +Check Regex Format,,,,,TRUE,DataProperty,,,regex match [a-f] +Check Regex Integer,,,,,TRUE,DataProperty,,,regex search ^\d+$ +Check Num,,,,,TRUE,DataProperty,,,num +Check Float,,,,,TRUE,DataProperty,,,float +Check Int,,,,,TRUE,DataProperty,,,int +Check String,,,,,TRUE,DataProperty,,,str +Check URL,,,,,TRUE,DataProperty,,,url +Check Match at Least,,,,,TRUE,DataProperty,,,matchAtLeastOne Patient.PatientID set +Check Match Exactly,,,,,TRUE,DataProperty,,,matchExactlyOne MockComponent.checkMatchExactly set +Check Match at Least values,,,,,TRUE,DataProperty,,,matchAtLeastOne MockComponent.checkMatchatLeastvalues value +Check Match Exactly values,,,,,TRUE,DataProperty,,,matchExactlyOne MockComponent.checkMatchExactlyvalues value +Check Recommended,,,,,FALSE,DataProperty,,,recommended +Check Ages,,,,,TRUE,DataProperty,,,protectAges +Check Unique,,,,,TRUE,DataProperty,,,unique error +Check Range,,,,,TRUE,DataProperty,,,inRange 50 100 error +Check Date,,,,,TRUE,DataProperty,,,date +Check 
NA,,,,,TRUE,DataProperty,,,int::IsNA +MockRDB,,,"Component, MockRDB_id, SourceManifest",,FALSE,DataType,,, +MockRDB_id,,,,,TRUE,DataProperty,,,int +SourceManifest,,,,,TRUE,DataProperty,,, +entityId,,,,,TRUE,DataProperty,,, \ No newline at end of file diff --git a/tests/test_api.py b/tests/test_api.py index 2b1329ebe..29a55615e 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -13,8 +13,10 @@ import pytest from schematic.configuration.configuration import Configuration -from schematic.schemas.generator import \ - SchemaGenerator # Local application/library specific imports. +from schematic.schemas.data_model_parser import DataModelParser +from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer +from schematic.schemas.data_model_relationships import DataModelRelationships + from schematic_api.api import create_app @@ -61,12 +63,14 @@ def test_manifest_json(helpers): @pytest.fixture(scope="class") def data_model_jsonld(): - data_model_jsonld ="https://raw.githubusercontent.com/Sage-Bionetworks/schematic/develop/tests/data/example.model.jsonld" + #data_model_jsonld ="https://raw.githubusercontent.com/Sage-Bionetworks/schematic/develop/tests/data/example.model.jsonld" + data_model_jsonld = "https://raw.githubusercontent.com/mialy-defelice/data_models/main/example.model.jsonld" yield data_model_jsonld @pytest.fixture(scope="class") def benchmark_data_model_jsonld(): - benchmark_data_model_jsonld = "https://raw.githubusercontent.com/Sage-Bionetworks/schematic/develop/tests/data/example.single_rule.model.jsonld" + #benchmark_data_model_jsonld = "https://raw.githubusercontent.com/Sage-Bionetworks/schematic/develop/tests/data/example.single_rule.model.jsonld" + benchmark_data_model_jsonld = "https://raw.githubusercontent.com/mialy-defelice/data_models/main/example.single_rule.model.jsonld" yield benchmark_data_model_jsonld def get_MockComponent_attribute(): @@ -74,8 +78,21 @@ def get_MockComponent_attribute(): Yield all of the mock conponent attributes one at a time TODO: pull in jsonld from fixture """ - sg = SchemaGenerator("https://raw.githubusercontent.com/Sage-Bionetworks/schematic/develop/tests/data/example.single_rule.model.jsonld") - attributes=sg.get_node_dependencies('MockComponent') + #schema_url = "https://raw.githubusercontent.com/Sage-Bionetworks/schematic/develop/tests/data/example.single_rule.model.jsonld" + schema_url = "https://raw.githubusercontent.com/mialy-defelice/data_models/main/example.single_rule.model.jsonld" + data_model_parser = DataModelParser(path_to_data_model = schema_url) + #Parse Model + parsed_data_model = data_model_parser.parse_model() + + # Instantiate DataModelGraph + data_model_grapher = DataModelGraph(parsed_data_model) + + # Generate graph + graph_data_model = data_model_grapher.generate_data_model_graph() + + dmge = DataModelGraphExplorer(graph_data_model) + #sg = SchemaGenerator("https://raw.githubusercontent.com/Sage-Bionetworks/schematic/develop/tests/data/example.single_rule.model.jsonld") + attributes=dmge.get_node_dependencies('MockComponent') attributes.remove('Component') for MockComponent_attribute in attributes: @@ -251,16 +268,15 @@ def test_component_requirement(self, client, data_model_jsonld, as_graph): @pytest.mark.schematic_api -class TestSchemaExplorerOperation: +class TestUtilsOperation: @pytest.mark.parametrize("strict_camel_case", [True, False]) - def test_get_property_label_from_display_name(self, client, data_model_jsonld, strict_camel_case): + def test_get_property_label_from_display_name(self, 
client, strict_camel_case): params = { - "schema_url": data_model_jsonld, "display_name": "mocular entity", "strict_camel_case": strict_camel_case } - response = client.get("http://localhost:3001/v1/explorer/get_property_label_from_display_name", query_string = params) + response = client.get("http://localhost:3001/v1/utils/get_property_label_from_display_name", query_string = params) assert response.status_code == 200 response_dt = json.loads(response.data) @@ -270,6 +286,9 @@ def test_get_property_label_from_display_name(self, client, data_model_jsonld, s else: assert response_dt == "mocularentity" + +@pytest.mark.schematic_api +class TestDataModelGraphExplorerOperation: def test_get_schema(self, client, data_model_jsonld): params = { "schema_url": data_model_jsonld @@ -315,9 +334,6 @@ def test_get_nodes_display_names(test, client, data_model_jsonld): assert response.status_code == 200 assert "Family History" and "Biospecimen" in response_dta - -@pytest.mark.schematic_api -class TestSchemaGeneratorOperation: @pytest.mark.parametrize("relationship", ["parentOf", "requiresDependency", "rangeValue", "domainValue"]) def test_get_subgraph_by_edge(self, client, data_model_jsonld, relationship): params = { @@ -338,7 +354,7 @@ def test_get_node_range(self, client, data_model_jsonld, return_display_names, n "node_label": node_label } - response = client.get('http://localhost:3001/v1/explorer/get_node_range', query_string=params) + response = client.get('http://localhost:3001/v1/schemas/get_node_range', query_string=params) response_dt = json.loads(response.data) assert response.status_code == 200 @@ -365,7 +381,7 @@ def test_node_dependencies(self, client, data_model_jsonld, source_node, return_ "return_schema_ordered": return_schema_ordered } - response = client.get('http://localhost:3001/v1/explorer/get_node_dependencies', query_string=params) + response = client.get('http://localhost:3001/v1/schemas/get_node_dependencies', query_string=params) response_dt = json.loads(response.data) assert response.status_code == 200 @@ -547,6 +563,7 @@ def test_generate_manifest_file_based_annotations(self, client, use_annotations, # and also make sure that entityId column appears in the end assert google_sheet_df.columns.to_list()[-1] == "entityId" + assert sorted(google_sheet_df.columns.to_list()) == sorted(expected) # make sure Filename, entityId, and component get filled with correct value diff --git a/tests/test_cli.py b/tests/test_cli.py index f3cd19a59..5498a5900 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -64,7 +64,7 @@ def test_get_example_manifest_default(self, runner, helpers, config: Configurati config.load_config("config_example.yml") result = runner.invoke( - manifest, ["--config", config.config_path, "get", "--data_type", "Patient", "--jsonld", data_model_jsonld] + manifest, ["--config", config.config_path, "get", "--data_type", "Patient", "--path_to_data_model", data_model_jsonld] ) @@ -79,7 +79,7 @@ def test_get_example_manifest_csv(self, runner, helpers, config: Configuration, config.load_config("config_example.yml") result = runner.invoke( - manifest, ["--config", config.config_path, "get", "--data_type", "Patient", "--jsonld", data_model_jsonld, "--output_csv", output_path] + manifest, ["--config", config.config_path, "get", "--data_type", "Patient", "--path_to_data_model", data_model_jsonld, "--output_csv", output_path] ) assert result.exit_code == 0 self.assert_expected_file(result, output_path) @@ -91,7 +91,7 @@ def test_get_example_manifest_excel(self, runner, helpers, 
config: Configuration config.load_config("config_example.yml") result = runner.invoke( - manifest, ["--config", config.config_path, "get", "--data_type", "Patient", "--jsonld", data_model_jsonld, "--output_xlsx", output_path] + manifest, ["--config", config.config_path, "get", "--data_type", "Patient", "--path_to_data_model", data_model_jsonld, "--output_xlsx", output_path] ) assert result.exit_code == 0 diff --git a/tests/test_manifest.py b/tests/test_manifest.py index 2ea337ca7..417b86b47 100644 --- a/tests/test_manifest.py +++ b/tests/test_manifest.py @@ -7,7 +7,9 @@ from unittest.mock import patch from unittest.mock import MagicMock from schematic.manifest.generator import ManifestGenerator -from schematic.schemas.generator import SchemaGenerator +from schematic.schemas.data_model_parser import DataModelParser +from schematic.schemas.data_model_graph import DataModelGraph,DataModelGraphExplorer +from schematic.schemas.data_model_json_schema import DataModelJSONSchema from schematic.configuration.configuration import Configuration from schematic.utils.google_api_utils import execute_google_api_requests @@ -17,6 +19,25 @@ logger = logging.getLogger(__name__) +def generate_graph_data_model(helpers, path_to_data_model): + """ + Simple helper function to generate a networkx graph data model from a CSV or JSONLD data model + """ + + # Instantiate Parser + data_model_parser = DataModelParser(path_to_data_model=path_to_data_model) + + #Parse Model + parsed_data_model = data_model_parser.parse_model() + + # Convert parsed model to graph + # Instantiate DataModelGraph + data_model_grapher = DataModelGraph(parsed_data_model) + + # Generate graph + graph_data_model = data_model_grapher.generate_data_model_graph() + + return graph_data_model @pytest.fixture( params=[ @@ -32,13 +53,21 @@ "skip_annotations-BulkRNAseqAssay", ], ) + def manifest_generator(helpers, request): # Rename request param for readability use_annotations, data_type = request.param + path_to_data_model = helpers.get_data_path("example.model.jsonld") + + # Get graph data model + graph_data_model = generate_graph_data_model(helpers, path_to_data_model=path_to_data_model) + + manifest_generator = ManifestGenerator( - path_to_json_ld=helpers.get_data_path("example.model.jsonld"), + path_to_data_model=path_to_data_model, + graph=graph_data_model, root=data_type, use_annotations=use_annotations, ) @@ -83,17 +112,23 @@ def manifest(dataset_id, manifest_generator, request): class TestManifestGenerator: def test_init(self, helpers): + path_to_data_model = helpers.get_data_path("example.model.jsonld") + + # Get graph data model + graph_data_model = generate_graph_data_model(helpers, path_to_data_model=path_to_data_model) + generator = ManifestGenerator( + graph=graph_data_model, title="mock_title", - path_to_json_ld=helpers.get_data_path("example.model.jsonld"), - root = "Patient" + path_to_data_model=path_to_data_model, + root = "Patient", ) assert type(generator.title) is str # assert generator.sheet_service == mock_creds["sheet_service"] assert generator.root is "Patient" - assert type(generator.sg) is SchemaGenerator + assert type(generator.dmge) is DataModelGraphExplorer @pytest.mark.parametrize("data_type, exc, exc_message", [("MissingComponent", LookupError, "could not be found in the data model schema"), @@ -103,11 +138,17 @@ def test_missing_root_error(self, helpers, data_type, exc, exc_message): """ Test for errors when either no DataType is provided or when a DataType is provided but not found in the schema """ + 
path_to_data_model = helpers.get_data_path("example.model.jsonld") + + # Get graph data model + graph_data_model = generate_graph_data_model(helpers, path_to_data_model=path_to_data_model) + # A LookupError should be raised and include message when the component cannot be found with pytest.raises(exc) as e: generator = ManifestGenerator( - path_to_json_ld=helpers.get_data_path("example.model.jsonld"), + path_to_data_model=helpers.get_data_path("example.model.jsonld"), + graph=graph_data_model, root=data_type, use_annotations=False, ) @@ -128,7 +169,6 @@ def test_get_manifest_first_time(self, manifest): return # Beyond this point, the output is assumed to be a data frame - # Update expectations based on whether the data type is file-based is_file_based = data_type in ["BulkRNA-seqAssay"] @@ -180,13 +220,20 @@ def test_get_manifest_excel(self, helpers, sheet_url, output_format, dataset_id) data_type = "Patient" + # Get path to data model + path_to_data_model = helpers.get_data_path("example.model.jsonld") + + # Get graph data model + graph_data_model = generate_graph_data_model(helpers, path_to_data_model=path_to_data_model) + + generator = ManifestGenerator( - path_to_json_ld=helpers.get_data_path("example.model.jsonld"), + path_to_data_model=path_to_data_model, + graph=graph_data_model, root=data_type, use_annotations=False, ) - manifest= generator.get_manifest(dataset_id=dataset_id, sheet_url = sheet_url, output_format = output_format) # if dataset id exists, it could return pandas dataframe, google spreadsheet, or an excel spreadsheet @@ -224,9 +271,17 @@ def test_get_manifest_no_annos(self, helpers, dataset_id): # Use a non-file based DataType data_type = "Patient" + # Get path to data model + path_to_data_model = helpers.get_data_path("example.model.jsonld") + + # Get graph data model + graph_data_model = generate_graph_data_model(helpers, path_to_data_model=path_to_data_model) + + # Instantiate object with use_annotations set to True generator = ManifestGenerator( - path_to_json_ld=helpers.get_data_path("example.model.jsonld"), + path_to_data_model=path_to_data_model, + graph=graph_data_model, root=data_type, use_annotations=True, ) @@ -274,7 +329,7 @@ def test_get_json_schema(self, simple_manifest_generator, helpers, schema_path_p else: mock_json_schema = Mock() mock_json_schema.return_value = "mock json ld" - with patch.object(SchemaGenerator, "get_json_schema_requirements",mock_json_schema): + with patch.object(DataModelJSONSchema, "get_json_validation_schema",mock_json_schema): json_schema = generator._get_json_schema(json_schema_filepath=None) assert json_schema == "mock json ld" @@ -303,8 +358,15 @@ def test_gather_all_fields(self, simple_manifest_generator): # assume there is no existing additional metadata @pytest.mark.parametrize("data_type,required_metadata_fields,expected", [("Patient", {"Component": []}, {'Component': ['Patient']}), ("BulkRNA-seqAssay", {"Filename": [], "Component":[]}, {'Component': ['BulkRNA-seqAssay']})]) def test_add_root_to_component_without_additional_metadata(self, helpers, data_type, required_metadata_fields, expected): + # Get path to data model + path_to_data_model = helpers.get_data_path("example.model.jsonld") + + # Get graph data model + graph_data_model = generate_graph_data_model(helpers, path_to_data_model=path_to_data_model) + manifest_generator = ManifestGenerator( - path_to_json_ld=helpers.get_data_path("example.model.jsonld"), + path_to_data_model=path_to_data_model, + graph=graph_data_model, root=data_type, ) 
manifest_generator._add_root_to_component(required_metadata_fields) @@ -316,8 +378,15 @@ def test_add_root_to_component_without_additional_metadata(self, helpers, data_t # assume there is additional metadata @pytest.mark.parametrize("additional_metadata", [{'author': ['test', '', ], 'Filename': ['test.txt', 'test2.txt'], 'Component': []}, {'Year of Birth': ['1988'], 'Filename': ['test.txt'], 'Component': []}]) def test_add_root_to_component_with_additional_metadata(self, helpers, additional_metadata): + # Get path to data model + path_to_data_model = helpers.get_data_path("example.model.jsonld") + + # Get graph data model + graph_data_model = generate_graph_data_model(helpers, path_to_data_model=path_to_data_model) + manifest_generator = ManifestGenerator( - path_to_json_ld=helpers.get_data_path("example.model.jsonld"), + path_to_data_model=path_to_data_model, + graph=graph_data_model, root="BulkRNA-seqAssay" ) @@ -361,8 +430,15 @@ def test_update_dataframe_with_existing_df(self, helpers, existing_manifest): data_type = "Patient" sheet_url = True + path_to_data_model = helpers.get_data_path("example.model.jsonld") + + # Get graph data model + graph_data_model = generate_graph_data_model(helpers, path_to_data_model=path_to_data_model) + + # Instantiate the Manifest Generator. - generator = ManifestGenerator(path_to_json_ld=helpers.get_data_path("example.model.jsonld"), + generator = ManifestGenerator(path_to_data_model=path_to_data_model, + graph=graph_data_model, root=data_type, use_annotations=False, ) diff --git a/tests/test_schemas.py b/tests/test_schemas.py index 2444d5f44..cccdb0208 100644 --- a/tests/test_schemas.py +++ b/tests/test_schemas.py @@ -1,294 +1,1462 @@ -import os +from copy import deepcopy +import json import logging - +import networkx as nx +import numpy as np +import os import pandas as pd import pytest +import random + +from schematic.schemas.data_model_edges import DataModelEdges +from schematic.schemas.data_model_nodes import DataModelNodes +from schematic.schemas.data_model_relationships import DataModelRelationships -from schematic.schemas import df_parser from schematic.utils.df_utils import load_df -from schematic.schemas.generator import SchemaGenerator +from schematic.utils.schema_utils import ( + get_label_from_display_name, + get_attribute_display_name_from_label, + convert_bool_to_str, + parse_validation_rules, +) +from schematic.utils.io_utils import load_json + +from schematic.schemas.data_model_graph import DataModelGraph +from schematic.schemas.data_model_nodes import DataModelNodes +from schematic.schemas.data_model_edges import DataModelEdges +from schematic.schemas.data_model_graph import DataModelGraphExplorer +from schematic.schemas.data_model_relationships import DataModelRelationships +from schematic.schemas.data_model_jsonld import ( + DataModelJsonLD, + convert_graph_to_jsonld, + BaseTemplate, + PropertyTemplate, + ClassTemplate, +) +from schematic.schemas.data_model_json_schema import DataModelJSONSchema +from schematic.schemas.data_model_parser import ( + DataModelParser, + DataModelCSVParser, + DataModelJSONLDParser, +) logging.basicConfig(level=logging.DEBUG) logger = logging.getLogger(__name__) +DATA_MODEL_DICT = {"example.model.csv": "CSV", "example.model.jsonld": "JSONLD"} -@pytest.fixture -def extended_schema_path(helpers, tmp_path): - data_model_csv_path = helpers.get_data_path("example.model.csv") - - example_model_df = load_df(data_model_csv_path) - - # additional "Assay" attribute to be added to example schema - assay_attr_row = { - 
"Attribute": "Assay", - "Description": ( - "A planned process with the objective to produce information " - "about the material entity that is the evaluant, by physically " - "examining it or its proxies.[OBI_0000070]" - ), - "Valid Values": "", - "DependsOn": "", - "Properties": "", - "Required": False, - "Parent": "", - "DependsOn Component": "", - "Source": "http://purl.obolibrary.org/obo/OBI_0000070", - "Validation Rules": "", - } - - example_model_df = example_model_df.append(assay_attr_row, ignore_index=True) - - # create empty temporary file to write extended schema to - schemas_folder = tmp_path / "schemas" - schemas_folder.mkdir() - extended_schema_path = schemas_folder / "extended_example.model.csv" - - example_model_df.to_csv(extended_schema_path) - - yield extended_schema_path -@pytest.fixture -def sg(helpers): +def test_fake_func(): + return + + +REL_FUNC_DICT = { + "get_attribute_display_name_from_label": get_attribute_display_name_from_label, + "parse_validation_rules": parse_validation_rules, + "get_label_from_display_name": get_label_from_display_name, + "convert_bool_to_str": convert_bool_to_str, + "test_fake_func": test_fake_func, +} +TEST_DN_DICT = { + "Bio Things": {"class": "BioThings", "property": "bioThings"}, + "bio things": {"class": "Biothings", "property": "biothings"}, +} +NODE_DISPLAY_NAME_DICT = {"Patient": False, "Sex": True} + + +def get_data_model_parser(helpers, data_model_name: str = None): + # Get path to data model + fullpath = helpers.get_data_path(path=data_model_name) + + # Instantiate DataModelParser + data_model_parser = DataModelParser(path_to_data_model=fullpath) + return data_model_parser + + +def generate_graph_data_model(helpers, data_model_name: str) -> nx.MultiDiGraph: + """ + Simple helper function to generate a networkx graph data model from a CSV or JSONLD data model + """ + + # Instantiate Parser + data_model_parser = get_data_model_parser( + helpers=helpers, data_model_name=data_model_name + ) + + # Parse Model + parsed_data_model = data_model_parser.parse_model() + + # Convert parsed model to graph + # Instantiate DataModelGraph + data_model_grapher = DataModelGraph(parsed_data_model) + + # Generate graph + graph_data_model = data_model_grapher.generate_data_model_graph() + + return graph_data_model + + +def generate_data_model_nodes(helpers, data_model_name: str) -> DataModelNodes: + # Instantiate Parser + data_model_parser = get_data_model_parser( + helpers=helpers, data_model_name=data_model_name + ) + # Parse Model + parsed_data_model = data_model_parser.parse_model() + # Instantiate DataModelNodes + data_model_nodes = DataModelNodes(attribute_relationships_dict=parsed_data_model) + return data_model_nodes + + +def get_data_model_json_schema(helpers, data_model_name: str = None): + # Get path to data model + fullpath = helpers.get_data_path(path=data_model_name) + + # Get Data Model Graph + graph_data_model = generate_graph_data_model( + helpers, data_model_name=data_model_name + ) + + # Instantiate DataModelJsonSchema + dmjs = DataModelJSONSchema(fullpath, graph=graph_data_model) + return dmjs - inputModelLocation = helpers.get_data_path('example.model.jsonld') - sg = SchemaGenerator(inputModelLocation) - yield sg +@pytest.fixture(name="relationships") +def get_relationships(helpers): + DMR = DataModelRelationships() + relationships_dict = DMR.relationships_dictionary + relationships = list(relationships_dict.keys()) + yield relationships -class TestDfParser: - def test_get_class(self, helpers): - se_obj = 
helpers.get_schema_explorer("example.model.jsonld") +@pytest.fixture(name="DMR") +def fixture_dmr(): + """Yields a data model relationships object for testing""" + yield DataModelRelationships() - actual = df_parser.get_class( - se=se_obj, - class_display_name="Test", - description="This is a dummy test class", - subclass_of=["Thing"], - requires_dependencies=["Test_Dep_1", "Test_Dep_2"], - requires_range=["Test_Start", "Test_End"], - requires_components=["Test_Comp_1", "Test_Comp_2"], - required=True, - validation_rules=["Rule_1", "Rule_2"], + +@pytest.fixture(name="csv_parser") +def fixture_dm_csv_parser(): + yield DataModelCSVParser() + + +@pytest.fixture(name="jsonld_parser") +def fixture_dm_jsonld_parser(): + yield DataModelJSONLDParser() + + +@pytest.fixture +def data_model_edges(): + """ + Yields a Data Model Edges object for testing + TODO: Update naming for DataModelGraphExplorer and fixture to avoid overlapping namespace + """ + yield DataModelEdges() + + +class TestDataModelParser: + def test_get_base_schema_path(self, helpers): + """Test that base schema path is returned properly. + Note: + data model parser class does not currently accept an new path to a base schema, + so just test that default BioThings data model path is returned. + """ + # Instantiate Data model parser. + data_model_parser = get_data_model_parser( + helpers=helpers, data_model_name="example.model.csv" ) - expected = { - "@id": "bts:Test", - "@type": "rdfs:Class", - "rdfs:comment": "This is a dummy test class", - "rdfs:label": "Test", - "rdfs:subClassOf": [{"@id": "bts:Thing"}], - "schema:isPartOf": {"@id": "http://schema.biothings.io"}, - "schema:rangeIncludes": [{"@id": "bts:TestStart"}, {"@id": "bts:TestEnd"}], - "sms:displayName": "Test", - "sms:required": "sms:true", - "sms:requiresComponent": [ - {"@id": "bts:Test_Comp_1"}, - {"@id": "bts:Test_Comp_2"}, - ], - "sms:requiresDependency": [ - {"@id": "bts:Test_Dep_1"}, - {"@id": "bts:Test_Dep_2"}, - ], - "sms:validationRules": ["Rule_1", "Rule_2"], - } + # Get path to default biothings model. + biothings_path = data_model_parser._get_base_schema_path(base_schema=None) - assert expected == actual + assert os.path.basename(biothings_path) == "biothings.model.jsonld" - def test_get_property(self, helpers): + @pytest.mark.parametrize( + "data_model", list(DATA_MODEL_DICT.keys()), ids=list(DATA_MODEL_DICT.values()) + ) + def test_get_model_type(self, helpers, data_model: str): + # Instantiate Data model parser. + data_model_parser = get_data_model_parser( + helpers=helpers, data_model_name=data_model + ) - se_obj = helpers.get_schema_explorer("example.model.jsonld") + # Check the data model type + assert (data_model == "example.model.csv") == ( + data_model_parser.model_type == "CSV" + ) + assert (data_model == "example.model.jsonld") == ( + data_model_parser.model_type == "JSONLD" + ) - actual = df_parser.get_property( - se=se_obj, - property_display_name="Test", - property_class_names=["Prop_Class"], - description="This is a dummy test property", - requires_range=["Test_Start", "Test_End"], - requires_dependencies=["Test_Dep_1", "Test_Dep_2"], - required=True, - validation_rules=["Rule_1", "Rule_2"], + @pytest.mark.parametrize( + "data_model", list(DATA_MODEL_DICT.keys()), ids=list(DATA_MODEL_DICT.values()) + ) + def test_parse_model(self, helpers, data_model: str): + """Test that the correct parser is called and that a dictionary is returned in the expected structure.""" + # Instantiate Data model parser. 
+ data_model_parser = get_data_model_parser( + helpers=helpers, data_model_name=data_model ) - expected = { - "@id": "bts:test", - "@type": "rdf:Property", - "rdfs:comment": "This is a dummy test property", - "rdfs:label": "test", - "schema:isPartOf": {"@id": "http://schema.biothings.io"}, - "schema:rangeIncludes": [{"@id": "bts:TestStart"}, {"@id": "bts:TestEnd"}], - "sms:displayName": "Test", - "sms:required": "sms:true", - "schema:domainIncludes": [{"@id": "bts:PropClass"}], - "sms:requiresDependency": [ - {"@id": "bts:Test_Dep_1"}, - {"@id": "bts:Test_Dep_2"}, - ], - "sms:validationRules": ["Rule_1", "Rule_2"], - } + # Parse Model + attr_rel_dictionary = data_model_parser.parse_model() + + # Get a key in the model + attribute_key = list(attr_rel_dictionary.keys())[0] + + # Check that the structure of the model dictionary conforms to expectations. + assert type(attr_rel_dictionary) == dict + assert attribute_key in attr_rel_dictionary.keys() + assert "Relationships" in attr_rel_dictionary[attribute_key] + assert "Attribute" in attr_rel_dictionary[attribute_key]["Relationships"] + + +@pytest.mark.parametrize("data_model", ["example.model.csv"], ids=["csv"]) +class TestDataModelCsvParser: + def test_check_schema_definition( + self, helpers, data_model: str, csv_parser: DataModelCSVParser + ): + """If the csv schema contains the required headers, then this function should not return anything. Check that this is so.""" + # path_to_data_model = helpers.get_data_path(path=data_model) + model_df = helpers.get_data_frame(path=data_model, data_model=True) + assert None == (csv_parser.check_schema_definition(model_df=model_df)) + + def test_gather_csv_attributes_relationships( + self, helpers, data_model: str, csv_parser: DataModelCSVParser + ): + """The output of the function is an attributes relationship dictionary; check that it is formatted properly.""" + path_to_data_model = helpers.get_data_path(path=data_model) + model_df = load_df(path_to_data_model, data_model=True) + + # Get output of the function: + attr_rel_dict = csv_parser.gather_csv_attributes_relationships( + model_df=model_df + ) - assert expected == actual + # Test the attr_rel_dict is formatted as expected: + # Get a key in the model + attribute_key = list(attr_rel_dict.keys())[0] + + # Check that the structure of the model dictionary conforms to expectations. + assert type(attr_rel_dict) == dict + assert attribute_key in attr_rel_dict.keys() + assert "Relationships" in attr_rel_dict[attribute_key] + assert "Attribute" in attr_rel_dict[attribute_key]["Relationships"] + + def test_parse_csv_model( + self, helpers, data_model: str, csv_parser: DataModelCSVParser + ): + """The output of the function is an attributes relationship dictionary; check that it is formatted properly.""" + path_to_data_model = helpers.get_data_path(path=data_model) + model_df = load_df(path_to_data_model, data_model=True) + + # Get output of the function: + attr_rel_dictionary = csv_parser.parse_csv_model( + path_to_data_model=path_to_data_model + ) - def test_attribute_exists(self, helpers): + # Test the attr_rel_dictionary is formatted as expected: + # Get a key in the model + attribute_key = list(attr_rel_dictionary.keys())[0] + + # Check that the structure of the model dictionary conforms to expectations.
+ assert type(attr_rel_dictionary) == dict + assert attribute_key in attr_rel_dictionary.keys() + assert "Relationships" in attr_rel_dictionary[attribute_key] + assert "Attribute" in attr_rel_dictionary[attribute_key]["Relationships"] + + +@pytest.mark.parametrize("data_model", ["example.model.jsonld"], ids=["jsonld"]) +class TestDataModelJsonLdParser: + def test_gather_jsonld_attributes_relationships( + self, helpers, data_model: str, jsonld_parser: DataModelJSONLDParser + ): + """The output of the function is an attributes relationship dictionary; check that it is formatted properly.""" + path_to_data_model = helpers.get_data_path(path=data_model) + model_jsonld = load_json(path_to_data_model) + + # Get output of the function: + attr_rel_dict = jsonld_parser.gather_jsonld_attributes_relationships( + model_jsonld=model_jsonld["@graph"] + ) - se_obj = helpers.get_schema_explorer("example.model.jsonld") + # Test the attr_rel_dict is formatted as expected: + # Get a key in the model + attribute_key = list(attr_rel_dict.keys())[0] + + # Check that the structure of the model dictionary conforms to expectations. + assert type(attr_rel_dict) == dict + assert attribute_key in attr_rel_dict.keys() + assert "Relationships" in attr_rel_dict[attribute_key] + assert "Attribute" in attr_rel_dict[attribute_key]["Relationships"] + + def test_parse_jsonld_model( + self, helpers, data_model: str, jsonld_parser: DataModelJSONLDParser + ): + """The output of the function is an attributes relationship dictionary; check that it is formatted properly.""" + path_to_data_model = helpers.get_data_path(path=data_model) + model_jsonld = load_json(path_to_data_model) + + # Get output of the function: + attr_rel_dictionary = jsonld_parser.parse_jsonld_model( + path_to_data_model=path_to_data_model + ) - # test when attribute is present in data model - attribute_present = df_parser.attribute_exists(se_obj, "Patient") + # Test the attr_rel_dictionary is formatted as expected: + # Get a key in the model + attribute_key = list(attr_rel_dictionary.keys())[0] + + # Check that the structure of the model dictionary conforms to expectations.
+ assert type(attr_rel_dictionary) == dict + assert attribute_key in attr_rel_dictionary.keys() + assert "Relationships" in attr_rel_dictionary[attribute_key] + assert "Attribute" in attr_rel_dictionary[attribute_key]["Relationships"] + + +class TestDataModelRelationships: + """Tests for DataModelRelationships class""" + + def test_define_data_model_relationships(self, DMR: DataModelRelationships): + """Tests relationships_dictionary created has correct keys""" + required_keys = [ + "jsonld_key", + "csv_header", + "type", + "edge_rel", + "required_header", + ] + required_edge_keys = ["edge_key", "edge_dir"] + required_node_keys = ["node_label", "node_attr_dict"] + + relationships = DMR.relationships_dictionary + + for relationship in relationships.values(): + for key in required_keys: + assert key in relationship.keys() + if relationship["edge_rel"]: + for key in required_edge_keys: + assert key in relationship.keys() + else: + for key in required_node_keys: + assert key in relationship.keys() + + def test_define_required_csv_headers(self, DMR: DataModelRelationships): + """Tests method returns correct values""" + assert DMR.define_required_csv_headers() == [ + "Attribute", + "Description", + "Valid Values", + "DependsOn", + "DependsOn Component", + "Required", + "Parent", + "Validation Rules", + "Properties", + "Source", + ] + + @pytest.mark.parametrize("edge", [True, False], ids=["True", "False"]) + def test_retreive_rel_headers_dict(self, DMR: DataModelRelationships, edge: bool): + """Tests method returns correct values""" + if edge: + assert DMR.retreive_rel_headers_dict(edge=edge) == { + "rangeIncludes": "Valid Values", + "requiresDependency": "DependsOn", + "requiresComponent": "DependsOn Component", + "subClassOf": "Parent", + "domainIncludes": "Properties", + } + else: + assert DMR.retreive_rel_headers_dict(edge=edge) == { + "displayName": "Attribute", + "label": None, + "comment": "Description", + "required": "Required", + "validationRules": "Validation Rules", + "isPartOf": None, + "id": "Source", + } + + +class TestDataModelGraph: + @pytest.mark.parametrize( + "data_model", + ["example.model.csv", "example.model.jsonld"], + ids=["csv", "jsonld"], + ) + def test_generate_data_model_graph(self, helpers, data_model): + """Check that data model graph is constructed properly, requires calling various classes. + TODO: In another test, check conditional dependencies. 
+ """ + graph = generate_graph_data_model(helpers=helpers, data_model_name=data_model) + + # Check that some edges are present as expected: + assert ("FamilyHistory", "Breast") in graph.edges("FamilyHistory") + assert ("BulkRNA-seqAssay", "Biospecimen") in graph.edges("BulkRNA-seqAssay") + assert ["Ab", "Cd", "Ef", "Gh"] == [ + k + for k, v in graph["CheckList"].items() + for vk, vv in v.items() + if vk == "rangeValue" + ] + + # Check that all relationships recorded between 'CheckList' and 'Ab' are present + assert "rangeValue" and "parentOf" in graph["CheckList"]["Ab"] + assert "requiresDependency" not in graph["CheckList"]["Ab"] + + # Check nodes: + assert "Patient" in graph.nodes + assert "GRCh38" in graph.nodes + + # Check weights + assert graph["Sex"]["Female"]["rangeValue"]["weight"] == 0 + assert ( + graph["MockComponent"]["CheckRegexFormat"]["requiresDependency"]["weight"] + == 4 + ) - # test when attribute is not present in data model - attribute_absent = df_parser.attribute_exists(se_obj, "RandomAttribute") + # Check Edge directions + assert 4 == (len(graph.out_edges("TissueStatus"))) + assert 2 == (len(graph.in_edges("TissueStatus"))) - assert attribute_present - assert not attribute_absent - def test_check_schema_definition(self, helpers): +class TestDataModelGraphExplorer: + def test_find_properties(self): + return - data_model_csv_path = helpers.get_data_path("example.model.csv") + def test_find_classes(self): + return - example_model_df = load_df(data_model_csv_path) + def test_find_node_range(self): + return - # when all required headers are provided in the CSV data model - actual_df = df_parser.check_schema_definition(example_model_df) + def test_get_adjacent_nodes_by_relationship(self): + return - assert actual_df is None + def test_get_component_requirements(self): + return - # when either "Requires" or "Requires Component" is present - # in column headers, raise ValueError - if "DependsOn Component" in example_model_df.columns: - del example_model_df["DependsOn Component"] + def test_get_component_requirements_graph(self): + return - example_model_df["Requires Component"] = "" + def get_descendants_by_edge_type(self): + return - with pytest.raises(ValueError): - df_parser.check_schema_definition(example_model_df) + def test_get_digraph_by_edge_type(self): + return - def test_create_nx_schema_objects(self, helpers, extended_schema_path): + def test_get_edges_by_relationship(self): + return - se_obj = helpers.get_schema_explorer("example.model.jsonld") + def test_get_ordered_entry(self): + return - # path to extended CSV data model which has one additional attribute - # namely, "Assay" - extended_csv_model_path = helpers.get_data_path(extended_schema_path) + def test_get_nodes_ancestors(self): + return - extended_model_df = load_df(extended_csv_model_path, data_model=True) + def test_get_node_comment(self): + return - extended_csv_model_se = df_parser.create_nx_schema_objects( - extended_model_df, se_obj - ) + def test_get_node_dependencies(self): + return - # check if the "Assay" attribute has been added to the new SchemaExplorer - # object with attributes from the extended schema - result = df_parser.attribute_exists(extended_csv_model_se, "Assay") + def test_get_nodes_descendants(self): + return - assert result + def test_get_nodes_display_names(self): + return - def test_get_base_schema_path(self): + def test_get_node_label(self): + return - base_schema_path = "/path/to/base_schema.jsonld" + def test_get_node_range(self): + return - # path to base schema is returned 
when base_schema is passed - result_path = df_parser._get_base_schema_path(base_schema=base_schema_path) + def test_get_node_required(self): + return - assert result_path == "/path/to/base_schema.jsonld" + def test_get_node_validation_rules(self): + return - # path to default BioThings data model is returned when no - # base schema path is passed explicitly - biothings_path = df_parser._get_base_schema_path() + def test_get_subgraph_by_edge_type(self): + return - assert os.path.basename(biothings_path) == "biothings.model.jsonld" + def test_find_adjacent_child_classes(self): + return - def test_convert_csv_to_data_model(self, helpers, extended_schema_path): - - csv_path = helpers.get_data_path("example.model.jsonld") - - extended_csv_model_path = helpers.get_data_path(extended_schema_path) - - # convert extended CSV data model to JSON-LD using provided - # CSV data model as base schema - extended_csv_model_se = df_parser._convert_csv_to_data_model( - extended_csv_model_path, csv_path - ) - - # if new attribute can be found in extended_csv_model_se - # we know the conversion was successful - attribute_present = df_parser.attribute_exists(extended_csv_model_se, "Assay") - - assert attribute_present - - def test_get_property_label_from_display_name(self, helpers): - se_obj = helpers.get_schema_explorer("example.model.jsonld") - - # tests where strict_camel_case is the same - assert(se_obj.get_property_label_from_display_name("howToAcquire") == "howToAcquire") - assert(se_obj.get_property_label_from_display_name("howToAcquire", strict_camel_case = True) == "howToAcquire") - assert(se_obj.get_property_label_from_display_name("how_to_acquire") == "howToAcquire") - assert(se_obj.get_property_label_from_display_name("how_to_acquire", strict_camel_case = True) == "howToAcquire") - assert(se_obj.get_property_label_from_display_name("howtoAcquire") == "howtoAcquire") - assert(se_obj.get_property_label_from_display_name("howtoAcquire", strict_camel_case = True) == "howtoAcquire") - assert(se_obj.get_property_label_from_display_name("How To Acquire") == "howToAcquire") - assert(se_obj.get_property_label_from_display_name("How To Acquire", strict_camel_case = True) == "howToAcquire") - assert(se_obj.get_property_label_from_display_name("Model Of Manifestation") == "modelOfManifestation") - assert(se_obj.get_property_label_from_display_name("Model Of Manifestation", strict_camel_case = True) == "modelOfManifestation") - assert(se_obj.get_property_label_from_display_name("ModelOfManifestation") == "modelOfManifestation") - assert(se_obj.get_property_label_from_display_name("ModelOfManifestation", strict_camel_case = True) == "modelOfManifestation") - assert(se_obj.get_property_label_from_display_name("model Of Manifestation") == "modelOfManifestation") - assert(se_obj.get_property_label_from_display_name("model Of Manifestation", strict_camel_case = True) == "modelOfManifestation") - - # tests where strict_camel_case changes the result - assert(se_obj.get_property_label_from_display_name("how to Acquire") == "howtoAcquire") - assert(se_obj.get_property_label_from_display_name("how to Acquire", strict_camel_case = True) == "howToAcquire") - assert(se_obj.get_property_label_from_display_name("How to Acquire") == "howtoAcquire") - assert(se_obj.get_property_label_from_display_name("How to Acquire", strict_camel_case = True) == "howToAcquire") - assert(se_obj.get_property_label_from_display_name("how to acquire") == "howtoacquire") - assert(se_obj.get_property_label_from_display_name("how to acquire", 
strict_camel_case = True) == "howToAcquire") - assert(se_obj.get_property_label_from_display_name("model of manifestation") == "modelofmanifestation") - assert(se_obj.get_property_label_from_display_name("model of manifestation", strict_camel_case = True) == "modelOfManifestation") - assert(se_obj.get_property_label_from_display_name("model of manifestation") == "modelofmanifestation") - assert(se_obj.get_property_label_from_display_name("model of manifestation", strict_camel_case = True) == "modelOfManifestation") - - def test_get_class_label_from_display_name(self, helpers): - se_obj = helpers.get_schema_explorer("example.model.jsonld") - - # tests where strict_camel_case is the same - assert(se_obj.get_class_label_from_display_name("howToAcquire") == "HowToAcquire") - assert(se_obj.get_class_label_from_display_name("howToAcquire", strict_camel_case = True) == "HowToAcquire") - assert(se_obj.get_class_label_from_display_name("how_to_acquire") == "HowToAcquire") - assert(se_obj.get_class_label_from_display_name("how_to_acquire", strict_camel_case = True) == "HowToAcquire") - assert(se_obj.get_class_label_from_display_name("howtoAcquire") == "HowtoAcquire") - assert(se_obj.get_class_label_from_display_name("howtoAcquire", strict_camel_case = True) == "HowtoAcquire") - assert(se_obj.get_class_label_from_display_name("How To Acquire") == "HowToAcquire") - assert(se_obj.get_class_label_from_display_name("How To Acquire", strict_camel_case = True) == "HowToAcquire") - assert(se_obj.get_class_label_from_display_name("Model Of Manifestation") == "ModelOfManifestation") - assert(se_obj.get_class_label_from_display_name("Model Of Manifestation", strict_camel_case = True) == "ModelOfManifestation") - assert(se_obj.get_class_label_from_display_name("ModelOfManifestation") == "ModelOfManifestation") - assert(se_obj.get_class_label_from_display_name("ModelOfManifestation", strict_camel_case = True) == "ModelOfManifestation") - assert(se_obj.get_class_label_from_display_name("model Of Manifestation") == "ModelOfManifestation") - assert(se_obj.get_class_label_from_display_name("model Of Manifestation", strict_camel_case = True) == "ModelOfManifestation") - - # tests where strict_camel_case changes the result - assert(se_obj.get_class_label_from_display_name("how to Acquire") == "HowtoAcquire") - assert(se_obj.get_class_label_from_display_name("how to Acquire", strict_camel_case = True) == "HowToAcquire") - assert(se_obj.get_class_label_from_display_name("How to Acquire") == "HowtoAcquire") - assert(se_obj.get_class_label_from_display_name("How to Acquire", strict_camel_case = True) == "HowToAcquire") - assert(se_obj.get_class_label_from_display_name("how to acquire") == "Howtoacquire") - assert(se_obj.get_class_label_from_display_name("how to acquire", strict_camel_case = True) == "HowToAcquire") - assert(se_obj.get_class_label_from_display_name("model of manifestation") == "Modelofmanifestation") - assert(se_obj.get_class_label_from_display_name("model of manifestation", strict_camel_case = True) == "ModelOfManifestation") - assert(se_obj.get_class_label_from_display_name("model of manifestation") == "Modelofmanifestation") - assert(se_obj.get_class_label_from_display_name("model of manifestation", strict_camel_case = True) == "ModelOfManifestation") - -class TestSchemaExplorer: - @pytest.mark.parametrize("class_name, expected_in_schema", [("Patient",True), ("ptaient",False), ("Biospecimen",True), ("InvalidComponent",False)]) - def test_is_class_in_schema(self, sg, class_name, expected_in_schema): + 
def test_find_parent_classes(self): + return + + def test_full_schema_graph(self): + return + + @pytest.mark.parametrize( + "class_name, expected_in_schema", + [ + ("Patient", True), + ("ptaient", False), + ("Biospecimen", True), + ("InvalidComponent", False), + ], + ) + def test_is_class_in_schema(self, helpers, class_name, expected_in_schema): """ Test to cover checking if a given class is in a schema. `is_class_in_schema` should return `True` if the class is in the schema and `False` if it is not. """ - + DMGE = helpers.get_data_model_graph_explorer(path="example.model.csv") # Check if class is in schema - class_in_schema = sg.se.is_class_in_schema(class_name) + class_in_schema = DMGE.is_class_in_schema(class_name) # Assert value is as expected - assert class_in_schema == expected_in_schema \ No newline at end of file + assert class_in_schema == expected_in_schema + + def test_sub_schema_graph(self): + return + + +@pytest.mark.parametrize( + "data_model", list(DATA_MODEL_DICT.keys()), ids=list(DATA_MODEL_DICT.values()) +) +class TestDataModelNodes: + def test_gather_nodes(self, helpers, data_model): + # Instantiate Parser + data_model_parser = get_data_model_parser( + helpers=helpers, data_model_name=data_model + ) + + # Parse Model + attr_rel_dictionary = data_model_parser.parse_model() + + # Instantiate DataModelNodes + data_model_nodes = generate_data_model_nodes( + helpers, data_model_name=data_model + ) + + attr_info = ("Patient", attr_rel_dictionary["Patient"]) + nodes = data_model_nodes.gather_nodes(attr_info=attr_info) + + # Make sure there are no repeat nodes + assert len(nodes) == len(set(nodes)) + + # Make sure the nodes returned conform to expectations (values and order) + ## The parsing records display names for relationships for CSV and labels for JSONLD, so the expectations are different between the two. + expected_nodes = [ + "Patient", + "Patient ID", + "Sex", + "Year of Birth", + "Diagnosis", + "Component", + "DataType", + ] + + assert nodes == expected_nodes + + # Ensure order is tested. + reordered_nodes = nodes.copy() + reordered_nodes.remove("Patient") + reordered_nodes.append("Patient") + assert reordered_nodes != expected_nodes + + def test_gather_all_nodes(self, helpers, data_model): + # Instantiate Parser + data_model_parser = get_data_model_parser( + helpers=helpers, data_model_name=data_model + ) + + # Parse Model + attr_rel_dictionary = data_model_parser.parse_model() + + # Instantiate DataModelNodes + data_model_nodes = generate_data_model_nodes( + helpers, data_model_name=data_model + ) + + all_nodes = data_model_nodes.gather_all_nodes_in_model( + attr_rel_dict=attr_rel_dictionary + ) + + # Make sure there are no repeat nodes + assert len(all_nodes) == len(set(all_nodes)) + + # Check that nodes from the first entry are recorded in order in all_nodes + # Only check the first entry, because subsequent ones might be in the same order as would be gathered with gather_nodes if it contained a node that was already recorded.
+ first_attribute = list(attr_rel_dictionary.keys())[0] + attr_info = (first_attribute, attr_rel_dictionary[first_attribute]) + expected_starter_nodes = data_model_nodes.gather_nodes(attr_info=attr_info) + actual_starter_nodes = all_nodes[0 : len(expected_starter_nodes)] + + assert actual_starter_nodes == expected_starter_nodes + + def test_get_rel_node_dict_info(self, helpers, data_model, relationships): + # Instantiate Parser + data_model_parser = get_data_model_parser( + helpers=helpers, data_model_name=data_model + ) + + # Instantiate DataModelNodes + data_model_nodes = generate_data_model_nodes( + helpers, data_model_name=data_model + ) + + for relationship in relationships: + rel_dict_info = data_model_nodes.get_rel_node_dict_info(relationship) + if rel_dict_info: + assert type(rel_dict_info[0]) == str + assert type(rel_dict_info[1]) == dict + assert "default" in rel_dict_info[1].keys() + + def test_get_data_model_properties(self, helpers, data_model): + # Instantiate Parser + data_model_parser = get_data_model_parser( + helpers=helpers, data_model_name=data_model + ) + + # Parse Model + attr_rel_dictionary = data_model_parser.parse_model() + + # Instantiate DataModelNodes + data_model_nodes = generate_data_model_nodes( + helpers, data_model_name=data_model + ) + + # Get properties in the data model + data_model_properties = data_model_nodes.get_data_model_properties( + attr_rel_dictionary + ) + + # In the current example model, there are no properties, would need to update this section if properties are added. + assert data_model_properties == [] + + # Update the attr_rel_dictionary to add a property, then see if its found. + # Get a random relationship key from the attr_rel_dictionary: + all_keys = list(attr_rel_dictionary.keys()) + random_index = len(all_keys) - 1 + rel_key = all_keys[random.randint(0, random_index)] + + # Modify the contents of that relationship + attr_rel_dictionary[rel_key]["Relationships"]["Properties"] = ["TestProperty"] + + # Get properties in the modified data model + data_model_properties = data_model_nodes.get_data_model_properties( + attr_rel_dictionary + ) + + assert data_model_properties == ["TestProperty"] + + def test_get_entry_type(self, helpers, data_model): + # Instantiate Parser + data_model_parser = get_data_model_parser( + helpers=helpers, data_model_name=data_model + ) + + # Parse Model + attr_rel_dictionary = data_model_parser.parse_model() + + # Update the attr_rel_dictionary to add a property, then see if it is assigned the correct entry type. + # Get a random relationship key from the attr_rel_dictionary: + all_keys = list(attr_rel_dictionary.keys()) + random_index = len(all_keys) - 1 + rel_key = all_keys[random.randint(0, random_index)] + + # Modify the contents of that relationship + attr_rel_dictionary[rel_key]["Relationships"]["Properties"] = ["TestProperty"] + + # Instantiate DataModelNodes + # Note: Get entry type uses self, so I will have to instantiate DataModelNodes outside of the generate_data_model_nodes function + data_model_nodes = DataModelNodes( + attribute_relationships_dict=attr_rel_dictionary + ) + + # In the example data model all attributes should be classes. 
+ for attr in attr_rel_dictionary.keys(): + entry_type = data_model_nodes.get_entry_type(attr) + assert entry_type == "class" + + # Check that the added property is properly loaded as a property + assert data_model_nodes.get_entry_type("TestProperty") == "property" + + @pytest.mark.parametrize( + "rel_func", list(REL_FUNC_DICT.values()), ids=list(REL_FUNC_DICT.keys()) + ) + @pytest.mark.parametrize( + "test_dn", list(TEST_DN_DICT.keys()), ids=list(TEST_DN_DICT.keys()) + ) + @pytest.mark.parametrize( + "test_bool", + ["True", "False", True, False, "kldjk"], + ids=["True_str", "False_str", "True_bool", "False_bool", "Random_str"], + ) + def test_run_rel_functions(self, helpers, data_model, rel_func, test_dn, test_bool): + # Call each relationship function to ensure that it is returning the desired result. + # Note all the called functions will also be tested in other unit tests. + # Instantiate Parser + data_model_parser = get_data_model_parser( + helpers=helpers, data_model_name=data_model + ) + + # Parse Model + attr_rel_dictionary = data_model_parser.parse_model() + + # Instantiate DataModelNodes + data_model_nodes = generate_data_model_nodes( + helpers, data_model_name=data_model + ) + + # Run functions the same way they are called in run_rel_functions: + if rel_func == get_attribute_display_name_from_label: + expected_display_names = list(attr_rel_dictionary.keys()) + returned_display_names = [ + data_model_nodes.run_rel_functions( + rel_func=get_attribute_display_name_from_label, + node_display_name=ndn, + attr_relationships=attr_rel_dictionary, + ) + for ndn in expected_display_names + ] + + assert expected_display_names == returned_display_names + + elif rel_func == parse_validation_rules: + # Find attributes with validation rules + # Gather Validation Rules + vrs = [] + for k, v in attr_rel_dictionary.items(): + if "Validation Rules" in v["Relationships"].keys(): + vrs.append(v["Relationships"]["Validation Rules"]) + parsed_vrs = [] + for attr in attr_rel_dictionary.keys(): + attr_relationships = attr_rel_dictionary[attr]["Relationships"] + if "Validation Rules" in attr_relationships: + parsed_vrs.append( + data_model_nodes.run_rel_functions( + rel_func=parse_validation_rules, + attr_relationships=attr_relationships, + csv_header="Validation Rules", + ) + ) + + assert len(vrs) == len(parsed_vrs) + if DATA_MODEL_DICT[data_model] == "CSV": + assert vrs != parsed_vrs + elif DATA_MODEL_DICT[data_model] == "JSONLD": + # JSONLDs already contain parsed validaiton rules so the raw vrs will match the parsed_vrs + assert vrs == parsed_vrs + + # For all validation rules where there are multiple rules, make sure they have been split as expected. + for i, pvr in enumerate(parsed_vrs): + delim_count = vrs[i][0].count("::") + if delim_count: + assert len(pvr) == delim_count + 1 + + elif rel_func == get_label_from_display_name: + # For a limited set check label is returned as expected. + for entry_type, expected_value in TEST_DN_DICT[test_dn].items(): + actual_value = data_model_nodes.run_rel_functions( + rel_func=get_label_from_display_name, + node_display_name=test_dn, + entry_type=entry_type, + ) + assert actual_value == expected_value + elif rel_func == convert_bool_to_str: + # return nothing if random string provided. 
+ csv_header = "Required" + attr_relationships = {csv_header: test_bool} + actual_conversion = data_model_nodes.run_rel_functions( + rel_func=convert_bool_to_str, + csv_header=csv_header, + attr_relationships=attr_relationships, + ) + if "true" in str(test_bool).lower(): + assert actual_conversion == True + elif "false" in str(test_bool).lower(): + assert actual_conversion == False + else: + assert actual_conversion == None + else: + # If the function passed is not currently supported, should hit an error. + try: + data_model_nodes.run_rel_functions(rel_func=test_fake_func) + convert_worked = False + except: + convert_worked = True + assert convert_worked == True + return + + @pytest.mark.parametrize( + "node_display_name", + list(NODE_DISPLAY_NAME_DICT.keys()), + ids=[str(v) for v in NODE_DISPLAY_NAME_DICT.values()], + ) + def test_generate_node_dict(self, helpers, data_model, node_display_name): + # Instantiate Parser + data_model_parser = get_data_model_parser( + helpers=helpers, data_model_name=data_model + ) + + # Parse Model + attr_rel_dictionary = data_model_parser.parse_model() + + # Instantiate DataModelNodes + data_model_nodes = generate_data_model_nodes( + helpers, data_model_name=data_model + ) + + node_dict = data_model_nodes.generate_node_dict( + node_display_name=node_display_name, + attr_rel_dict=attr_rel_dictionary, + ) + + # Check that the output is as expected for the required key. + if NODE_DISPLAY_NAME_DICT[node_display_name]: + assert node_dict["required"] == True + else: + # Looking up this way, in case we add empty defaults back to JSONLD it wont fail, but will only be absent in JSONLD not CSV. + if not node_dict["required"] == False: + assert DATA_MODEL_DICT[data_model] == "JSONLD" + + def test_generate_node(self, helpers, data_model): + # Test adding a dummy node + node_dict = {"label": "test_label"} + + # Get Graph + graph_data_model = generate_graph_data_model( + helpers, data_model_name=data_model + ) + + # Instantiate DataModelNodes + data_model_nodes = generate_data_model_nodes( + helpers, data_model_name=data_model + ) + + # Assert the test node is not already in the graph + assert node_dict["label"] not in graph_data_model.nodes + + # Add test node + data_model_nodes.generate_node(graph_data_model, node_dict) + + # Check that the test node has been added + assert node_dict["label"] in graph_data_model.nodes + + +class TestDataModelEdges: + """ + Cases to test + Where node == attribute_display_name + Weights + domain includes weights + list weights + single element weights + Edges + subClassOf/domainIncludes relationship edge + any other relationship edge + rangeIncludes relationship edge + + """ + + def test_skip_edge(self, helpers, DMR, data_model_edges): + # Instantiate graph object and set node + G = nx.MultiDiGraph() + node = "Diagnosis" + + # Instantiate Parser + data_model_parser = get_data_model_parser( + helpers=helpers, data_model_name="validator_dag_test.model.csv" + ) + + # Parse Model + parsed_data_model = data_model_parser.parse_model() + + # Instantiate data model Nodes object + DMN = DataModelNodes(parsed_data_model) + + # Get edge relationships and all nodes from the parsed model + edge_relationships = DMR.retreive_rel_headers_dict(edge=True) + all_nodes = DMN.gather_all_nodes_in_model(attr_rel_dict=parsed_data_model) + + # Sanity check to ensure that the node we intend to test exists in the data model + assert node in all_nodes + + # Add a single node to the graph + node_dict = {} + node_dict = DMN.generate_node_dict(node, 
parsed_data_model) + node_dict[node] = node_dict + G = DMN.generate_node(G, node_dict) + + # Check the edges in the graph, there should be none + before_edges = deepcopy(G.edges) + + edge_list = [] + # Generate an edge in the graph with one node and a subset of the parsed data model + # We're attempting to add an edge for a node that is the only one in the graph, + # so `generate_edge` should skip adding edges and return the same graph + edge_list_2 = data_model_edges.generate_edge( + node, node_dict, {node: parsed_data_model[node]}, edge_relationships, edge_list, + ) + + for node_1, node_2, edge_dict in edge_list_2: + G.add_edge(node_1, node_2, key=edge_dict['key'], weight=edge_dict['weight']) + + # Assert that no edges were added and that the current graph edges are the same as before the call to `generate_edge` + assert before_edges == G.edges + + @pytest.mark.parametrize( + "node_to_add, edge_relationship", + [ + ("DataType", "parentOf"), + ("Female", "parentOf"), + ("Sex", "requiresDependency"), + ], + ids=["subClassOf", "Valid Value", "all others"], + ) + def test_generate_edge( + self, helpers, DMR, data_model_edges, node_to_add, edge_relationship + ): + # Instantiate graph object + G = nx.MultiDiGraph() + + # Instantiate Parser + data_model_parser = get_data_model_parser( + helpers=helpers, data_model_name="validator_dag_test.model.csv" + ) + + # Parse Model + parsed_data_model = data_model_parser.parse_model() + + # Instantiate data model Nodes object + DMN = DataModelNodes(parsed_data_model) + + # Get edge relationships and all nodes from the parsed model + edge_relationships = DMR.retreive_rel_headers_dict(edge=True) + all_nodes = DMN.gather_all_nodes_in_model(attr_rel_dict=parsed_data_model) + + # Sanity check to ensure that the node we intend to test exists in the data model + assert node_to_add in all_nodes + + # Add all nodes to the graph + all_node_dict = {} + for node in all_nodes: + node_dict = DMN.generate_node_dict(node, parsed_data_model) + all_node_dict[node] = node_dict + G = DMN.generate_node(G, node_dict) + + # Check the edges in the graph, there should be none + before_edges = deepcopy(G.edges) + + edge_list = [] + + # Generate edges for whichever node we are testing + edge_list_2 = data_model_edges.generate_edge( + node_to_add, all_node_dict, parsed_data_model, edge_relationships, edge_list, + ) + + for node_1, node_2, edge_dict in edge_list_2: + G.add_edge(node_1, node_2, key=edge_dict['key'], weight=edge_dict['weight']) + + # Assert that the current edges are different from the edges of the graph before + assert G.edges > before_edges + + # Assert that somewhere in the current edges for the node we added, that the correct relationship exists + relationship_df = pd.DataFrame(G.edges, columns=["node1", "node2", "edge"]) + assert (relationship_df["edge"] == edge_relationship).any() + + @pytest.mark.parametrize( + "node_to_add, other_node, expected_weight, data_model_path", + [ + ("Patient ID", "Biospecimen", 1, "validator_dag_test.model.csv"), + ("dataset_id", "cohorts", -1, "properties.test.model.csv"), + ], + ids=["list", "domainIncludes"], + ) + def test_generate_weights( + self, + helpers, + DMR, + data_model_edges, + node_to_add, + other_node, + expected_weight, + data_model_path, + ): + # Instantiate graph object + G = nx.MultiDiGraph() + + # Instantiate Parser + data_model_parser = get_data_model_parser( + helpers=helpers, data_model_name=data_model_path + ) + + # Parse Model + parsed_data_model = data_model_parser.parse_model() + + # Instantiate data model 
Nodes object + DMN = DataModelNodes(parsed_data_model) + + # Get edge relationships and all nodes from the parsed model + edge_relationships = DMR.retreive_rel_headers_dict(edge=True) + all_nodes = DMN.gather_all_nodes_in_model(attr_rel_dict=parsed_data_model) + + # Sanity check to ensure that the node we intend to test exists in the data model + assert node_to_add in all_nodes + + # Add all nodes to the graph + all_node_dict = {} + for node in all_nodes: + node_dict = DMN.generate_node_dict(node, parsed_data_model) + all_node_dict[node] = node_dict + G = DMN.generate_node(G, node_dict) + + # Check the edges in the graph, there should be none + before_edges = deepcopy(G.edges) + + edge_list = [] + + # Generate edges for whichever node we are testing + edge_list_2 = data_model_edges.generate_edge( + node_to_add, all_node_dict, parsed_data_model, edge_relationships, edge_list, + ) + + for node_1, node_2, edge_dict in edge_list_2: + G.add_edge(node_1, node_2, key=edge_dict['key'], weight=edge_dict['weight']) + + # Assert that the current edges are different from the edges of the graph before + assert G.edges > before_edges + + # Cast the edges and weights to a DataFrame for easier indexing + edges_and_weights = pd.DataFrame( + G.edges.data(), columns=["node1", "node2", "weights"] + ).set_index("node1") + + # Weights are set to a negative number to indicate that the weight cannot be known reliably beforehand and must be determined by reading the schema + # Get the index of the property in the schema + # Weights for properties are determined by their order in the schema. + # This would allow the tests to continue to function correctly in the case where other attributes were added to the schema + if expected_weight < 0: + schema = helpers.get_data_frame( + path=helpers.get_data_path(data_model_path), data_model=True + ) + expected_weight = schema.index[schema["Attribute"] == other_node][0] + logger.debug( + f"Expected weight for the edge of nodes {node_to_add} and {other_node} is {expected_weight}."
+ ) + + # Assert that the weight added is what is expected + if node_to_add in ["Patient ID"]: + assert ( + edges_and_weights.loc[other_node, "weights"]["weight"] + == expected_weight + ) + elif node_to_add in ["cohorts"]: + assert ( + edges_and_weights.loc[node_to_add, "weights"]["weight"] + == expected_weight + ) + + +@pytest.mark.parametrize( + "data_model", list(DATA_MODEL_DICT.keys()), ids=list(DATA_MODEL_DICT.values()) +) +class TestDataModelJsonSchema: + @pytest.mark.parametrize( + "node_range", + [[], ["healthy"], ["healthy", "cancer"]], + ids=["empty_range", "single_range", "multi_range"], + ) + @pytest.mark.parametrize( + "node_name", ["", "Diagnosis"], ids=["empty_node_name", "Diagnosis_node_name"] + ) + @pytest.mark.parametrize("blank", [True, False], ids=["True_blank", "False_blank"]) + def test_get_array_schema(self, helpers, data_model, node_range, node_name, blank): + dmjs = get_data_model_json_schema(helpers=helpers, data_model_name=data_model) + array_schema = dmjs.get_array_schema( + node_range=node_range, node_name=node_name, blank=blank + ) + + # check node_name is recorded as the key to the array schema + assert node_name in array_schema + + # Check maxItems is the length of node_range + assert len(node_range) == array_schema[node_name]["maxItems"] + + # Check that blank value is added at the end of node_range, if true + if blank: + assert array_schema[node_name]["items"]["enum"][-1] == "" + assert len(array_schema[node_name]["items"]["enum"]) == len(node_range) + 1 + else: + assert array_schema[node_name]["items"]["enum"] == node_range + assert len(array_schema[node_name]["items"]["enum"]) == len(node_range) + + @pytest.mark.parametrize( + "node_name", ["", "Diagnosis"], ids=["empty_node_name", "Diagnosis_node_name"] + ) + def test_get_non_blank_schema(self, helpers, data_model, node_name): + dmjs = get_data_model_json_schema(helpers=helpers, data_model_name=data_model) + non_blank_schema = dmjs.get_non_blank_schema(node_name=node_name) + # check node_name is recorded as the key to the non-blank schema + assert node_name in non_blank_schema + assert non_blank_schema[node_name] == {"not": {"type": "null"}, "minLength": 1} + + @pytest.mark.parametrize( + "node_range", + [[], ["healthy"], ["healthy", "cancer"]], + ids=["empty_range", "single_range", "multi_range"], + ) + @pytest.mark.parametrize( + "node_name", ["", "Diagnosis"], ids=["empty_node_name", "Diagnosis_node_name"] + ) + @pytest.mark.parametrize("blank", [True, False], ids=["True_blank", "False_blank"]) + def test_get_range_schema(self, helpers, data_model, node_range, node_name, blank): + dmjs = get_data_model_json_schema(helpers=helpers, data_model_name=data_model) + range_schema = dmjs.get_range_schema( + node_range=node_range, node_name=node_name, blank=blank + ) + + # check node_name is recorded as the key to the range schema + assert node_name in range_schema + + # Check that blank value is added at the end of node_range, if true + if blank: + assert range_schema[node_name]["enum"][-1] == "" + assert len(range_schema[node_name]["enum"]) == len(node_range) + 1 + else: + assert range_schema[node_name]["enum"] == node_range + assert len(range_schema[node_name]["enum"]) == len(node_range) + + @pytest.mark.parametrize( + "source_node", ["", "Patient"], ids=["empty_node_name", "patient_source"] + ) + @pytest.mark.parametrize( + "schema_name", + ["", "Test_Schema_Name"], + ids=["empty_schema_name", "schema_name"], + ) + def test_get_json_validation_schema( + self, helpers, data_model, source_node, schema_name
+ ): + dmjs = get_data_model_json_schema(helpers=helpers, data_model_name=data_model) + + try: + # Get validation schema + json_validation_schema = dmjs.get_json_validation_schema( + source_node=source_node, schema_name=schema_name + ) + + # Check Keys in Schema + expected_jvs_keys = [ + "$schema", + "$id", + "title", + "type", + "properties", + "required", + "allOf", + ] + actual_jvs_keys = list(json_validation_schema.keys()) + assert expected_jvs_keys == actual_jvs_keys + + # Check title + assert schema_name == json_validation_schema["title"] + + # Check contents of validation schema + assert "Diagnosis" in json_validation_schema["properties"] + assert "Cancer" in json_validation_schema["properties"]["Diagnosis"]["enum"] + except: + # Should only fail if no source node is provided. + assert source_node == "" + + +class TestDataModelJsonLd: + @pytest.mark.parametrize( + "data_model", list(DATA_MODEL_DICT.keys()), ids=list(DATA_MODEL_DICT.values()) + ) + def test_init(self, helpers, data_model): + # Test that __init__ is being set up properly + # Get Graph + graph_data_model = generate_graph_data_model( + helpers, data_model_name=data_model + ) + + # Instantiate DataModelJsonLD + data_model_jsonld = DataModelJsonLD(Graph=graph_data_model) + + # Test that __init__ is being set up properly + assert type(data_model_jsonld.graph) == nx.MultiDiGraph + assert type(data_model_jsonld.rel_dict) == dict + assert "required" in data_model_jsonld.rel_dict + assert type(data_model_jsonld.dmge) == DataModelGraphExplorer + assert data_model_jsonld.output_path == "" + + def test_base_jsonld_template(self, helpers): + # Gather the templates + base_template = BaseTemplate() + base_jsonld_template = json.loads(base_template.to_json()) + + # Test base template is constructed as expected + assert "@context" in base_jsonld_template + assert "@graph" in base_jsonld_template + assert "@id" in base_jsonld_template + + def test_property_template(self, helpers): + # Get Property Template + empty_template = PropertyTemplate() + property_template = json.loads(empty_template.to_json()) + + expected_property_template = { + "@id": "", + "@type": "rdf:Property", + "rdfs:comment": "", + "rdfs:label": "", + "schema:domainIncludes": [], + "schema:rangeIncludes": [], + "schema:isPartOf": {}, + "sms:displayName": "", + "sms:required": "sms:false", + "sms:validationRules": [], + } + assert property_template == expected_property_template + + def test_class_template(self, helpers): + # Get Class Template + empty_template = ClassTemplate() + class_template = json.loads(empty_template.to_json()) + + expected_class_template = { + "@id": "", + "@type": "rdfs:Class", + "rdfs:comment": "", + "rdfs:label": "", + "rdfs:subClassOf": [], + "schema:isPartOf": {}, + "schema:rangeIncludes": [], + "sms:displayName": "", + "sms:required": "sms:false", + "sms:requiresDependency": [], + "sms:requiresComponent": [], + "sms:validationRules": [], + } + assert class_template == expected_class_template + + @pytest.mark.parametrize( + "data_model", list(DATA_MODEL_DICT.keys()), ids=list(DATA_MODEL_DICT.values()) + ) + @pytest.mark.parametrize( + "template_type", ["property", "class"], ids=["property", "class"] + ) + @pytest.mark.parametrize("node", ["", "Patient"], ids=["no node", "Patient"]) + def test_fill_entry_template(self, helpers, data_model, template_type, node): + # Get Graph + graph_data_model = generate_graph_data_model( + helpers, data_model_name=data_model + ) + + # Instantiate DataModelJsonLD + data_model_jsonld = 
DataModelJsonLD(Graph=graph_data_model) + + # Get empty template + if template_type == "property": + property_template = PropertyTemplate() + template = json.loads(property_template.to_json()) + elif template_type == "class": + class_template = ClassTemplate() + template = json.loads(class_template.to_json()) + + # Make a copy of the template, since template is mutable + template_copy = deepcopy(template) + + try: + # Fill out template for given node. + object_template = data_model_jsonld.fill_entry_template( + template=template_copy, node=node + ) + # Ensure template keys are present (not all original keys will be present due to cleaning empty values): + except: + # Should only fail if no node is given + assert node == "" + + if "object_template" in locals(): + # Check that object template keys match the expected keys + actual_keys = list(object_template.keys()) + if template_type == "property": + expected_keys = [ + "@id", + "@type", + "rdfs:comment", + "rdfs:label", + "schema:isPartOf", + "sms:displayName", + "sms:required", + "sms:validationRules", + ] + elif template_type == "class": + expected_keys = [ + "@id", + "@type", + "rdfs:comment", + "rdfs:label", + "rdfs:subClassOf", + "schema:isPartOf", + "sms:displayName", + "sms:required", + "sms:requiresDependency", + "sms:validationRules", + ] + assert (set(actual_keys) - set(expected_keys)) == ( + set(expected_keys) - set(actual_keys) + ) + + @pytest.mark.parametrize( + "data_model", list(DATA_MODEL_DICT.keys()), ids=list(DATA_MODEL_DICT.values()) + ) + @pytest.mark.parametrize( + "template_type", ["property", "class"], ids=["property", "class"] + ) + def test_add_contexts_to_entries(self, helpers, data_model, template_type): + # Will likely need to change when contexts added to model. + # Get Graph + graph_data_model = generate_graph_data_model( + helpers, data_model_name=data_model + ) + + # Instantiate DataModelJsonLD + data_model_jsonld = DataModelJsonLD(Graph=graph_data_model) + + # Get empty template + if template_type == "property": + property_template = PropertyTemplate() + template = json.loads(property_template.to_json()) + elif template_type == "class": + class_template = ClassTemplate() + template = json.loads(class_template.to_json()) + + # Make a copy of the template, since template is mutable + template_copy = deepcopy(template) + + # Fill out template for given node. + object_template = data_model_jsonld.fill_entry_template( + template=template_copy, node="Patient" + ) + + if "sms:required" in object_template: + assert "sms" in object_template["sms:required"] + if "@id" in object_template: + assert "bts" in object_template["@id"] + + @pytest.mark.parametrize( + "data_model", list(DATA_MODEL_DICT.keys()), ids=list(DATA_MODEL_DICT.values()) + ) + def test_clean_template( + self, helpers, data_model: str, DMR: DataModelRelationships + ): + # TODO: This will need to change with contexts bc they are hard coded here. 
+ # Get Graph + graph_data_model = generate_graph_data_model( + helpers, data_model_name=data_model + ) + + # Instantiate DataModelJsonLD + data_model_jsonld = DataModelJsonLD(Graph=graph_data_model) + + # Get empty template + class_template = ClassTemplate() + template = json.loads(class_template.to_json()) + + # Make a copy of the template, since template is mutable + template_copy = deepcopy(template) + + assert "sms:requiresDependency" in template_copy + + # Fill out some mock entries in the template: + template_copy["@id"] = "bts:CheckURL" + template_copy["rdfs:label"] = "CheckURL" + data_model_relationships = DMR.relationships_dictionary + + # Clean template + data_model_jsonld.clean_template( + template=template_copy, data_model_relationships=data_model_relationships + ) + + # Look for expected changes after cleaning + # Check that expected JSONLD default is added + assert template_copy["sms:required"] == "sms:false" + assert template_copy["sms:validationRules"] == [] + + # Check that non-required JSONLD keys are removed. + assert "sms:requiresDependency" not in template_copy + + @pytest.mark.parametrize( + "data_model", list(DATA_MODEL_DICT.keys()), ids=list(DATA_MODEL_DICT.values()) + ) + @pytest.mark.parametrize( + "valid_values", + [[], ["Other", "Female", "Male"], ["A", "Bad", "Entry"]], + ids=["Empty List", "Disordered List", "Incorrect List"], + ) + def test_reorder_template_entries(self, helpers, data_model, valid_values): + # Note: as long as an entry is recorded in the template, even if the recorded values are incorrect, + # reorder_template_entries will correct (reorder) them within this function. + # Get Graph + graph_data_model = generate_graph_data_model( + helpers, data_model_name=data_model + ) + + # Instantiate DataModelJsonLD + data_model_jsonld = DataModelJsonLD(Graph=graph_data_model) + + # Get empty template + class_template = ClassTemplate() + template = json.loads(class_template.to_json()) + + # Make a copy of the template, since template is mutable + template_copy = deepcopy(template) + + # Fill out template with 'Sex' attribute from example model + template_copy["@id"] = "Sex" + template_copy["rdfs:label"] = "Sex" + template_copy["sms:required"] = "sms:false" + template_copy["schema:rangeIncludes"] = valid_values + + # Now reorder: + data_model_jsonld.reorder_template_entries(template=template_copy) + if valid_values: + assert template_copy["schema:rangeIncludes"] == [ + {"@id": "bts:Female"}, + {"@id": "bts:Male"}, + {"@id": "bts:Other"}, + ] + else: + assert template_copy["schema:rangeIncludes"] == [] + + @pytest.mark.parametrize( + "data_model", list(DATA_MODEL_DICT.keys()), ids=list(DATA_MODEL_DICT.values()) + ) + def test_generate_jsonld_object(self, helpers, data_model): + # Check that the JSONLD object is generated and has some populated entries.
+ + # Get Graph + graph_data_model = generate_graph_data_model( + helpers, data_model_name=data_model + ) + + # Instantiate DataModelJsonLD + data_model_jsonld = DataModelJsonLD(Graph=graph_data_model) + jsonld_dm = data_model_jsonld.generate_jsonld_object() + + assert list(jsonld_dm.keys()) == ["@context", "@graph", "@id"] + assert len(jsonld_dm["@graph"]) > 1 + + @pytest.mark.parametrize( + "data_model", list(DATA_MODEL_DICT.keys()), ids=list(DATA_MODEL_DICT.values()) + ) + def test_convert_graph_to_jsonld(self, helpers, data_model): + # Get Graph + graph_data_model = generate_graph_data_model( + helpers, data_model_name=data_model + ) + + # Generate JSONLD + jsonld_dm = convert_graph_to_jsonld(Graph=graph_data_model) + assert list(jsonld_dm.keys()) == ["@context", "@graph", "@id"] + assert len(jsonld_dm["@graph"]) > 1 diff --git a/tests/test_store.py b/tests/test_store.py index 8f66ade50..ed8990579 100644 --- a/tests/test_store.py +++ b/tests/test_store.py @@ -13,9 +13,11 @@ from synapseclient.core.exceptions import SynapseHTTPError from synapseclient.entity import File -from schematic.configuration.configuration import Configuration +from schematic.schemas.data_model_parser import DataModelParser +from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer +from schematic.schemas.data_model_relationships import DataModelRelationships + from schematic.models.metadata import MetadataModel -from schematic.schemas.generator import SchemaGenerator from schematic.store.base import BaseStorage from schematic.store.synapse import (DatasetFileView, ManifestDownload,) @@ -144,10 +146,24 @@ def test_get_file_entityIds(self, helpers, synapse_store, only_new_files): 'file-based']) def test_annotation_submission(self, synapse_store, helpers, manifest_path, test_annotations, datasetId, manifest_record_type, config: Configuration): # Upload dataset annotations - sg = SchemaGenerator(config.model_location) + + # Instantiate DataModelParser + data_model_parser = DataModelParser(path_to_data_model = config.model_location) + + #Parse Model + parsed_data_model = data_model_parser.parse_model() + + # Instantiate DataModelGraph + data_model_grapher = DataModelGraph(parsed_data_model) + + # Generate graph + graph_data_model = data_model_grapher.generate_data_model_graph() + + # Instantiate DataModelGraphExplorer + dmge = DataModelGraphExplorer(graph_data_model) manifest_id = synapse_store.associateMetadataWithFiles( - schemaGenerator = sg, + dmge = dmge, metadataManifestPath = helpers.get_data_path(manifest_path), datasetId = datasetId, manifest_record_type = manifest_record_type, @@ -380,11 +396,25 @@ def test_createTable(self, helpers, synapse_store, config: Configuration, projec # associate metadata with files manifest_path = "mock_manifests/table_manifest.csv" inputModelLocaiton = helpers.get_data_path(os.path.basename(config.model_location)) - sg = SchemaGenerator(inputModelLocaiton) + + # Instantiate DataModelParser + data_model_parser = DataModelParser(path_to_data_model = inputModelLocaiton) + + #Parse Model + parsed_data_model = data_model_parser.parse_model() + + # Instantiate DataModelGraph + data_model_grapher = DataModelGraph(parsed_data_model) + + # Generate graph + graph_data_model = data_model_grapher.generate_data_model_graph() + + # Instantiate DataModelGraphExplorer + dmge = DataModelGraphExplorer(graph_data_model) # updating file view on synapse takes a long time manifestId = synapse_store.associateMetadataWithFiles( - schemaGenerator = sg, + dmge = dmge, 
metadataManifestPath = helpers.get_data_path(manifest_path), datasetId = datasetId, manifest_record_type = 'table_and_file', @@ -419,11 +449,24 @@ def test_replaceTable(self, helpers, synapse_store, config: Configuration, proje # associate org FollowUp metadata with files inputModelLocaiton = helpers.get_data_path(os.path.basename(config.model_location)) - sg = SchemaGenerator(inputModelLocaiton) + #sg = SchemaGenerator(inputModelLocaiton) - # updating file view on synapse takes a long time + data_model_parser = DataModelParser(path_to_data_model = inputModelLocaiton) + #Parse Model + parsed_data_model = data_model_parser.parse_model() + + # Instantiate DataModelGraph + data_model_grapher = DataModelGraph(parsed_data_model) + + # Generate graph + graph_data_model = data_model_grapher.generate_data_model_graph() + + # Instantiate DataModelGraphExplorer + dmge = DataModelGraphExplorer(graph_data_model) + + # updating file view on synapse takes a long time manifestId = synapse_store.associateMetadataWithFiles( - schemaGenerator = sg, + dmge = dmge, metadataManifestPath = helpers.get_data_path(manifest_path), datasetId = datasetId, manifest_record_type = 'table_and_file', @@ -445,7 +488,7 @@ def test_replaceTable(self, helpers, synapse_store, config: Configuration, proje # Associate replacement manifest with files manifestId = synapse_store.associateMetadataWithFiles( - schemaGenerator = sg, + dmge = dmge, metadataManifestPath = helpers.get_data_path(replacement_manifest_path), datasetId = datasetId, manifest_record_type = 'table_and_file', @@ -486,11 +529,23 @@ def test_upsertTable(self, helpers, synapse_store, config:Configuration, project # associate org FollowUp metadata with files inputModelLocaiton = helpers.get_data_path(os.path.basename(config.model_location)) - sg = SchemaGenerator(inputModelLocaiton) + + data_model_parser = DataModelParser(path_to_data_model = inputModelLocaiton) + #Parse Model + parsed_data_model = data_model_parser.parse_model() + + # Instantiate DataModelGraph + data_model_grapher = DataModelGraph(parsed_data_model) + + # Generate graph + graph_data_model = data_model_grapher.generate_data_model_graph() + + # Instantiate DataModelGraphExplorer + dmge = DataModelGraphExplorer(graph_data_model) # updating file view on synapse takes a long time manifestId = synapse_store.associateMetadataWithFiles( - schemaGenerator = sg, + dmge = dmge, metadataManifestPath = helpers.get_data_path(manifest_path), datasetId = datasetId, manifest_record_type = 'table_and_file', @@ -516,7 +571,7 @@ def test_upsertTable(self, helpers, synapse_store, config:Configuration, project # Associate new manifest with files manifestId = synapse_store.associateMetadataWithFiles( - schemaGenerator = sg, + dmge = dmge, metadataManifestPath = helpers.get_data_path(replacement_manifest_path), datasetId = datasetId, manifest_record_type = 'table_and_file', @@ -604,8 +659,3 @@ def test_entity_type_checking(self, synapse_store, entity_id, caplog): if entity_id == "syn27600053": for record in caplog.records: assert "You are using entity type: folder. 
Please provide a file ID" in record.message - - - - - diff --git a/tests/test_utils.py b/tests/test_utils.py index d8f0aff46..d9f06c845 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,3 +1,4 @@ +import copy import json import logging import os @@ -15,16 +16,32 @@ from pandas.testing import assert_frame_equal from synapseclient.core.exceptions import SynapseHTTPError +from schematic.schemas.data_model_parser import DataModelParser +from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer +from schematic.schemas.data_model_jsonld import DataModelJsonLD, BaseTemplate, PropertyTemplate, ClassTemplate +from schematic.schemas.data_model_json_schema import DataModelJSONSchema + +from schematic.schemas.data_model_relationships import DataModelRelationships +from schematic.schemas.data_model_jsonld import DataModelJsonLD, convert_graph_to_jsonld + +from schematic.exceptions import ( + MissingConfigValueError, + MissingConfigAndArgumentValueError, +) from schematic import LOADER from schematic.exceptions import (MissingConfigAndArgumentValueError, MissingConfigValueError) -from schematic.schemas import df_parser -from schematic.schemas.explorer import SchemaExplorer + from schematic.utils import (cli_utils, df_utils, general, io_utils, validate_utils) from schematic.utils.general import (calculate_datetime, check_synapse_cache_size, clear_synapse_cache, entity_type_mapping) +from schematic.utils.schema_utils import (export_schema, + get_property_label_from_display_name, + get_class_label_from_display_name, + strip_context) + logging.basicConfig(level=logging.DEBUG) logger = logging.getLogger(__name__) @@ -316,46 +333,134 @@ def test_populate_column(self): output_df = df_utils.populate_df_col_with_another_col(input_df,'column1','column2') assert (output_df["column2"].values == ["col1Val","col1Val"]).all() +class TestSchemaUtils: + def test_get_property_label_from_display_name(self, helpers): + + # tests where strict_camel_case is the same + assert(get_property_label_from_display_name("howToAcquire") == "howToAcquire") + assert(get_property_label_from_display_name("howToAcquire", strict_camel_case = True) == "howToAcquire") + assert(get_property_label_from_display_name("how_to_acquire") == "howToAcquire") + assert(get_property_label_from_display_name("how_to_acquire", strict_camel_case = True) == "howToAcquire") + assert(get_property_label_from_display_name("howtoAcquire") == "howtoAcquire") + assert(get_property_label_from_display_name("howtoAcquire", strict_camel_case = True) == "howtoAcquire") + assert(get_property_label_from_display_name("How To Acquire") == "howToAcquire") + assert(get_property_label_from_display_name("How To Acquire", strict_camel_case = True) == "howToAcquire") + assert(get_property_label_from_display_name("Model Of Manifestation") == "modelOfManifestation") + assert(get_property_label_from_display_name("Model Of Manifestation", strict_camel_case = True) == "modelOfManifestation") + assert(get_property_label_from_display_name("ModelOfManifestation") == "modelOfManifestation") + assert(get_property_label_from_display_name("ModelOfManifestation", strict_camel_case = True) == "modelOfManifestation") + assert(get_property_label_from_display_name("model Of Manifestation") == "modelOfManifestation") + assert(get_property_label_from_display_name("model Of Manifestation", strict_camel_case = True) == "modelOfManifestation") + + # tests where strict_camel_case changes the result + assert(get_property_label_from_display_name("how to Acquire") 
== "howtoAcquire") + assert(get_property_label_from_display_name("how to Acquire", strict_camel_case = True) == "howToAcquire") + assert(get_property_label_from_display_name("How to Acquire") == "howtoAcquire") + assert(get_property_label_from_display_name("How to Acquire", strict_camel_case = True) == "howToAcquire") + assert(get_property_label_from_display_name("how to acquire") == "howtoacquire") + assert(get_property_label_from_display_name("how to acquire", strict_camel_case = True) == "howToAcquire") + assert(get_property_label_from_display_name("model of manifestation") == "modelofmanifestation") + assert(get_property_label_from_display_name("model of manifestation", strict_camel_case = True) == "modelOfManifestation") + assert(get_property_label_from_display_name("model of manifestation") == "modelofmanifestation") + assert(get_property_label_from_display_name("model of manifestation", strict_camel_case = True) == "modelOfManifestation") + + def test_get_class_label_from_display_name(self, helpers): + + # tests where strict_camel_case is the same + assert(get_class_label_from_display_name("howToAcquire") == "HowToAcquire") + assert(get_class_label_from_display_name("howToAcquire", strict_camel_case = True) == "HowToAcquire") + assert(get_class_label_from_display_name("how_to_acquire") == "HowToAcquire") + assert(get_class_label_from_display_name("how_to_acquire", strict_camel_case = True) == "HowToAcquire") + assert(get_class_label_from_display_name("howtoAcquire") == "HowtoAcquire") + assert(get_class_label_from_display_name("howtoAcquire", strict_camel_case = True) == "HowtoAcquire") + assert(get_class_label_from_display_name("How To Acquire") == "HowToAcquire") + assert(get_class_label_from_display_name("How To Acquire", strict_camel_case = True) == "HowToAcquire") + assert(get_class_label_from_display_name("Model Of Manifestation") == "ModelOfManifestation") + assert(get_class_label_from_display_name("Model Of Manifestation", strict_camel_case = True) == "ModelOfManifestation") + assert(get_class_label_from_display_name("ModelOfManifestation") == "ModelOfManifestation") + assert(get_class_label_from_display_name("ModelOfManifestation", strict_camel_case = True) == "ModelOfManifestation") + assert(get_class_label_from_display_name("model Of Manifestation") == "ModelOfManifestation") + assert(get_class_label_from_display_name("model Of Manifestation", strict_camel_case = True) == "ModelOfManifestation") + + # tests where strict_camel_case changes the result + assert(get_class_label_from_display_name("how to Acquire") == "HowtoAcquire") + assert(get_class_label_from_display_name("how to Acquire", strict_camel_case = True) == "HowToAcquire") + assert(get_class_label_from_display_name("How to Acquire") == "HowtoAcquire") + assert(get_class_label_from_display_name("How to Acquire", strict_camel_case = True) == "HowToAcquire") + assert(get_class_label_from_display_name("how to acquire") == "Howtoacquire") + assert(get_class_label_from_display_name("how to acquire", strict_camel_case = True) == "HowToAcquire") + assert(get_class_label_from_display_name("model of manifestation") == "Modelofmanifestation") + assert(get_class_label_from_display_name("model of manifestation", strict_camel_case = True) == "ModelOfManifestation") + assert(get_class_label_from_display_name("model of manifestation") == "Modelofmanifestation") + assert(get_class_label_from_display_name("model of manifestation", strict_camel_case = True) == "ModelOfManifestation") + + @pytest.mark.parametrize("context_value", 
['@id', 'sms:required'], ids=['remove_at', 'remove_sms']) + def test_strip_context(self, helpers, context_value): + stripped_contex = strip_context(context_value=context_value) + if '@id' == context_value: + assert stripped_contex == ('', 'id') + elif 'sms:required' == context_value: + assert stripped_contex == ('sms', 'required') class TestValidateUtils: def test_validate_schema(self, helpers): - + ''' + Previously did: se_obj = helpers.get_schema_explorer("example.model.jsonld") - actual = validate_utils.validate_schema(se_obj.schema) + schema is defined as: self.schema = load_json(schema) + + TODO: Validate this is doing what its supposed to. + ''' + # Get data model path + data_model_path = helpers.get_data_path("example.model.jsonld") + schema = io_utils.load_json(data_model_path) + #need to pass the jsonschema + actual = validate_utils.validate_schema(schema) + assert actual is None + def test_validate_class_schema(self, helpers): + """ + Get a class template, fill it out with mock data, and validate against a JSON Schema - se_obj = helpers.get_schema_explorer("example.model.jsonld") + """ + class_template = ClassTemplate() + self.class_template = json.loads(class_template.to_json()) - mock_class = se_obj.generate_class_template() + mock_class = copy.deepcopy(self.class_template) mock_class["@id"] = "bts:MockClass" mock_class["@type"] = "rdfs:Class" mock_class["@rdfs:comment"] = "This is a mock class" mock_class["@rdfs:label"] = "MockClass" - mock_class["rdfs:subClassOf"]["@id"] = "bts:Patient" + mock_class["rdfs:subClassOf"].append({"@id":"bts:Patient"}) - actual = validate_utils.validate_class_schema(mock_class) + error = validate_utils.validate_class_schema(mock_class) - assert actual is None + assert error is None + def test_validate_property_schema(self, helpers): + """ + Get a property template, fill it out with mock data, and validate against a JSON Schema - se_obj = helpers.get_schema_explorer("example.model.jsonld") + """ + property_template = PropertyTemplate() + self.property_template = json.loads(property_template.to_json()) - mock_class = se_obj.generate_property_template() + mock_class = copy.deepcopy(self.property_template) mock_class["@id"] = "bts:MockProperty" mock_class["@type"] = "rdf:Property" mock_class["@rdfs:comment"] = "This is a mock Patient class" - mock_class["@rdfs:label"] = "MockProperty" - mock_class["schema:domainIncludes"]["@id"] = "bts:Patient" + mock_class["@rdfs:label"] = "MockProperty" + mock_class["schema:domainIncludes"].append({"@id":"bts:Patient"}) - actual = validate_utils.validate_property_schema(mock_class) - - assert actual is None + error = validate_utils.validate_property_schema(mock_class) + assert error is None + class TestCsvUtils: def test_csv_to_schemaorg(self, helpers, tmp_path): @@ -363,17 +468,32 @@ def test_csv_to_schemaorg(self, helpers, tmp_path): This test also ensures that the CSV and JSON-LD files for the example data model stay in sync. 
+ TODO: This probably should be moved out of here and to test_schemas """ csv_path = helpers.get_data_path("example.model.csv") - base_se = df_parser._convert_csv_to_data_model(csv_path) + # Instantiate DataModelParser + data_model_parser = DataModelParser(path_to_data_model = csv_path) + + #Parse Model + parsed_data_model = data_model_parser.parse_model() + + # Instantiate DataModelGraph + data_model_grapher = DataModelGraph(parsed_data_model) + + # Generate graph + graph_data_model = data_model_grapher.generate_data_model_graph() + + # Convert graph to JSONLD + jsonld_data_model = convert_graph_to_jsonld(Graph=graph_data_model) # saving updated schema.org schema actual_jsonld_path = tmp_path / "example.from_csv.model.jsonld" - base_se.export_schema(actual_jsonld_path) + export_schema(jsonld_data_model, actual_jsonld_path) # Compare both JSON-LD files expected_jsonld_path = helpers.get_data_path("example.model.jsonld") expected_jsonld = open(expected_jsonld_path).read() actual_jsonld = open(actual_jsonld_path).read() + assert expected_jsonld == actual_jsonld diff --git a/tests/test_validation.py b/tests/test_validation.py index 57a1c78e6..df1211cfb 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -1,6 +1,7 @@ import os import logging import re +import networkx as nx import jsonschema import pytest from pathlib import Path @@ -10,18 +11,19 @@ from schematic.models.validate_manifest import ValidateManifest from schematic.models.metadata import MetadataModel from schematic.store.synapse import SynapseStorage -from schematic.schemas.generator import SchemaGenerator + +from schematic.schemas.data_model_parser import DataModelParser +from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer +from schematic.schemas.data_model_json_schema import DataModelJSONSchema + from schematic.utils.validate_rules_utils import validation_rule_info logging.basicConfig(level=logging.DEBUG) logger = logging.getLogger(__name__) -@pytest.fixture -def sg(helpers): - - inputModelLocation = helpers.get_data_path('example.model.jsonld') - sg = SchemaGenerator(inputModelLocation) - - yield sg +@pytest.fixture(name="dmge") +def DMGE(helpers): + dmge = helpers.get_data_model_graph_explorer(path="example.model.jsonld") + yield dmge @pytest.fixture def metadataModel(helpers): @@ -61,7 +63,7 @@ def test_valid_manifest(self,helpers,metadataModel): assert warnings == [] - def test_invalid_manifest(self,helpers,sg,metadataModel): + def test_invalid_manifest(self,helpers, dmge,metadataModel): manifestPath = helpers.get_data_path("mock_manifests/Invalid_Test_Manifest.csv") rootNode = 'MockComponent' @@ -77,7 +79,7 @@ def test_invalid_manifest(self,helpers,sg,metadataModel): row_num = '3', attribute_name = 'Check Num', invalid_entry = 'c', - sg = sg, + dmge = dmge, )[0] in errors assert GenerateError.generate_type_error( @@ -85,7 +87,7 @@ def test_invalid_manifest(self,helpers,sg,metadataModel): row_num = '3', attribute_name = 'Check Int', invalid_entry = '5.63', - sg = sg, + dmge = dmge, )[0] in errors assert GenerateError.generate_type_error( @@ -93,7 +95,7 @@ def test_invalid_manifest(self,helpers,sg,metadataModel): row_num = '3', attribute_name = 'Check String', invalid_entry = '94', - sg = sg, + dmge = dmge, )[0] in errors assert GenerateError.generate_list_error( @@ -103,7 +105,7 @@ def test_invalid_manifest(self,helpers,sg,metadataModel): attribute_name = 'Check List', list_error = "not_comma_delimited", invalid_entry = 'invalid list values', - sg = sg, + dmge = dmge, 
)[0] in errors assert GenerateError.generate_list_error( @@ -113,7 +115,7 @@ def test_invalid_manifest(self,helpers,sg,metadataModel): attribute_name = 'Check Regex List', list_error = "not_comma_delimited", invalid_entry = 'ab cd ef', - sg = sg, + dmge = dmge, )[0] in errors assert GenerateError.generate_regex_error( @@ -123,7 +125,7 @@ def test_invalid_manifest(self,helpers,sg,metadataModel): attribute_name = 'Check Regex Format', module_to_call = 'match', invalid_entry = 'm', - sg = sg, + dmge = dmge, )[0] in errors assert GenerateError.generate_regex_error( @@ -133,7 +135,7 @@ def test_invalid_manifest(self,helpers,sg,metadataModel): attribute_name = 'Check Regex Single', module_to_call = 'search', invalid_entry = 'q', - sg = sg, + dmge = dmge, )[0] in errors assert GenerateError.generate_regex_error( @@ -143,7 +145,7 @@ def test_invalid_manifest(self,helpers,sg,metadataModel): attribute_name = 'Check Regex Integer', module_to_call = 'search', invalid_entry = '5.4', - sg = sg, + dmge = dmge, )[0] in errors assert GenerateError.generate_url_error( @@ -154,14 +156,14 @@ def test_invalid_manifest(self,helpers,sg,metadataModel): attribute_name = 'Check URL', argument = None, invalid_entry = 'http://googlef.com/', - sg = sg, + dmge = dmge, )[0] in errors date_err = GenerateError.generate_content_error( val_rule = 'date', attribute_name = 'Check Date', - sg = sg, + dmge = dmge, row_num = ['2','3','4'], error_val = ['84-43-094', '32-984', 'notADate'], )[0] @@ -171,7 +173,7 @@ def test_invalid_manifest(self,helpers,sg,metadataModel): assert GenerateError.generate_content_error( val_rule = 'unique error', attribute_name = 'Check Unique', - sg = sg, + dmge = dmge, row_num = ['2','3','4'], error_val = ['str1'], )[0] in errors @@ -179,7 +181,7 @@ def test_invalid_manifest(self,helpers,sg,metadataModel): assert GenerateError.generate_content_error( val_rule = 'inRange 50 100 error', attribute_name = 'Check Range', - sg = sg, + dmge = dmge, row_num = ['3'], error_val = ['30'], )[0] in errors @@ -188,13 +190,13 @@ def test_invalid_manifest(self,helpers,sg,metadataModel): assert GenerateError.generate_content_error( val_rule = 'recommended', attribute_name = 'Check Recommended', - sg = sg, + dmge = dmge, )[1] in warnings assert GenerateError.generate_content_error( val_rule = 'protectAges', attribute_name = 'Check Ages', - sg = sg, + dmge = dmge, row_num = ['2','3'], error_val = ['6549','32851'], )[1] in warnings @@ -205,7 +207,7 @@ def test_invalid_manifest(self,helpers,sg,metadataModel): attribute_name='Check Match at Least', invalid_entry = ['7163'], missing_manifest_ID = ['syn27600110', 'syn29381803'], - sg = sg, + dmge = dmge, )[1] in warnings assert GenerateError.generate_cross_warning( @@ -213,7 +215,7 @@ def test_invalid_manifest(self,helpers,sg,metadataModel): row_num = ['3'], attribute_name = 'Check Match at Least values', invalid_entry = ['51100'], - sg = sg, + dmge = dmge, )[1] in warnings assert \ @@ -221,14 +223,14 @@ def test_invalid_manifest(self,helpers,sg,metadataModel): val_rule = 'matchExactlyOne', attribute_name='Check Match Exactly', matching_manifests = ['syn29862078', 'syn27648165'], - sg = sg, + dmge = dmge, )[1] in warnings \ or \ GenerateError.generate_cross_warning( val_rule = 'matchExactlyOne', attribute_name='Check Match Exactly', matching_manifests = ['syn29862066', 'syn27648165'], - sg = sg, + dmge = dmge, )[1] in warnings @@ -237,7 +239,7 @@ def test_invalid_manifest(self,helpers,sg,metadataModel): row_num = ['2', '3', '4'], attribute_name='Check Match Exactly values', 
invalid_entry = ['71738', '98085', '210065'], - sg = sg, + dmge = dmge, )[1] warning_in_list = [cross_warning[1] in warning for warning in warnings] assert any(warning_in_list) @@ -245,7 +247,7 @@ def test_invalid_manifest(self,helpers,sg,metadataModel): - def test_in_house_validation(self,helpers,sg,metadataModel): + def test_in_house_validation(self,helpers,dmge,metadataModel): manifestPath = helpers.get_data_path("mock_manifests/Invalid_Test_Manifest.csv") rootNode = 'MockComponent' @@ -262,7 +264,7 @@ def test_in_house_validation(self,helpers,sg,metadataModel): row_num = '3', attribute_name = 'Check Num', invalid_entry = 'c', - sg = sg, + dmge = dmge, )[0] in errors assert GenerateError.generate_type_error( @@ -270,7 +272,7 @@ def test_in_house_validation(self,helpers,sg,metadataModel): row_num = '3', attribute_name = 'Check Int', invalid_entry = '5.63', - sg = sg, + dmge = dmge, )[0] in errors assert GenerateError.generate_type_error( @@ -278,7 +280,7 @@ def test_in_house_validation(self,helpers,sg,metadataModel): row_num = '3', attribute_name = 'Check String', invalid_entry = '94', - sg = sg, + dmge = dmge, )[0] in errors assert GenerateError.generate_type_error( @@ -286,7 +288,7 @@ def test_in_house_validation(self,helpers,sg,metadataModel): row_num = '3', attribute_name = 'Check NA', invalid_entry = '9.5', - sg = sg, + dmge = dmge, )[0] in errors assert GenerateError.generate_list_error( @@ -296,7 +298,7 @@ def test_in_house_validation(self,helpers,sg,metadataModel): attribute_name = 'Check List', list_error = "not_comma_delimited", invalid_entry = 'invalid list values', - sg = sg, + dmge = dmge, )[0] in errors assert GenerateError.generate_list_error( @@ -306,7 +308,7 @@ def test_in_house_validation(self,helpers,sg,metadataModel): attribute_name = 'Check Regex List', list_error = "not_comma_delimited", invalid_entry = 'ab cd ef', - sg = sg, + dmge = dmge, )[0] in errors assert GenerateError.generate_regex_error( @@ -316,7 +318,7 @@ def test_in_house_validation(self,helpers,sg,metadataModel): attribute_name = 'Check Regex Single', module_to_call = 'search', invalid_entry = 'q', - sg = sg, + dmge = dmge, )[0] in errors assert GenerateError.generate_regex_error( @@ -326,7 +328,7 @@ def test_in_house_validation(self,helpers,sg,metadataModel): attribute_name = 'Check Regex Format', module_to_call = 'match', invalid_entry = 'm', - sg = sg, + dmge = dmge, )[0] in errors assert GenerateError.generate_url_error( @@ -337,7 +339,7 @@ def test_in_house_validation(self,helpers,sg,metadataModel): attribute_name = 'Check URL', argument = None, invalid_entry = 'http://googlef.com/', - sg = sg, + dmge = dmge, )[0] in errors @@ -348,7 +350,7 @@ def test_in_house_validation(self,helpers,sg,metadataModel): attribute_name='Check Match at Least', invalid_entry = ['7163'], missing_manifest_ID = ['syn27600110', 'syn29381803'], - sg = sg, + dmge = dmge, )[1] in warnings assert GenerateError.generate_cross_warning( @@ -356,7 +358,7 @@ def test_in_house_validation(self,helpers,sg,metadataModel): row_num = ['3'], attribute_name = 'Check Match at Least values', invalid_entry = ['51100'], - sg = sg, + dmge = dmge, )[1] in warnings assert \ @@ -364,14 +366,14 @@ def test_in_house_validation(self,helpers,sg,metadataModel): val_rule = 'matchExactlyOne', attribute_name='Check Match Exactly', matching_manifests = ['syn29862078', 'syn27648165'], - sg = sg, + dmge = dmge, )[1] in warnings \ or \ GenerateError.generate_cross_warning( val_rule = 'matchExactlyOne', attribute_name='Check Match Exactly', 
matching_manifests = ['syn29862066', 'syn27648165'], - sg = sg, + dmge = dmge, )[1] in warnings assert GenerateError.generate_cross_warning( @@ -379,69 +381,73 @@ def test_in_house_validation(self,helpers,sg,metadataModel): row_num = ['2', '3', '4'], attribute_name='Check Match Exactly values', invalid_entry = ['71738', '98085', '210065'], - sg = sg, + dmge = dmge, )[1] in warnings @pytest.mark.rule_combos(reason = 'This introduces a great number of tests covering every possible rule combination that are only necessary on occasion.') @pytest.mark.parametrize("base_rule, second_rule", get_rule_combinations()) - def test_rule_combinations(self, helpers, sg, base_rule, second_rule, metadataModel,): + def test_rule_combinations(self, helpers, dmge, base_rule, second_rule, metadataModel): + """ + TODO: Describe what this test is doing. + Updating the data model graph to allow testing of allowable rule combinations. + Works one rule combo at a time using (get_rule_combinations.) + """ rule_regex = re.compile(base_rule+'.*') + rootNode = 'MockComponent' manifestPath = helpers.get_data_path("mock_manifests/Rule_Combo_Manifest.csv") manifest = helpers.get_data_frame(manifestPath) - - # adjust rules and arguments as necessary for testing combinations - for attribute in sg.se.schema['@graph']: #Doing it in a loop becasue of sg.se.edit_class design - if 'sms:validationRules' in attribute and attribute['sms:validationRules']: - # remove default combination for attribute's reules - if attribute['sms:displayName'] == 'Check NA': - attribute['sms:validationRules'].remove('int') - - # update class - sg.se.edit_class(attribute) + + # Get a view of the node data + all_node_data = dmge.graph.nodes.data() + + # Update select validation rules in the data model graph for columns in the manifest + for attribute in manifest.columns: + # Get the node label + node_label = dmge.get_node_label(attribute) + + # Get a view of the recorded info for current node + node_info = all_node_data[node_label] + if node_info['validationRules']: + + if node_info['displayName'] == 'Check NA': + # Edit the node info -in place- + node_info['validationRules'].remove('int') break - - # Add rule args if necessary - if base_rule in attribute['sms:validationRules'] or re.match(rule_regex, attribute['sms:validationRules'][0]): + + if base_rule in node_info['validationRules'] or re.match(rule_regex, node_info['validationRules'][0]): if second_rule.startswith('matchAtLeastOne') or second_rule.startswith('matchExactlyOne'): - rule_args = f" MockComponent.{attribute['rdfs:label']} Patient.PatientID" + rule_args = f" MockComponent.{node_label} Patient.PatientID" elif second_rule.startswith('inRange'): rule_args = ' 1 1000 warning' elif second_rule.startswith('regex'): rule_args = ' search [a-f]' else: rule_args = '' - - attribute['sms:validationRules'].append(second_rule + rule_args) - - # update class - sg.se.edit_class(attribute) + # Edit the node info -in place- + node_info['validationRules'].append(second_rule + rule_args) break - target_column=attribute['sms:displayName'] - for col in manifest.columns: - if col not in ('Component', target_column): - manifest.drop(columns=col, inplace=True) + # Update the manifest to only contain the Component and attribute column where the rule was changed. 
+ manifest = manifest[['Component', attribute]] + + data_model_js = DataModelJSONSchema(jsonld_path=helpers.get_data_path('example.model.jsonld'), graph=dmge.graph) + json_schema = data_model_js.get_json_validation_schema(source_node=rootNode, schema_name=rootNode + "_validation") - rootNode = 'MockComponent' validateManifest = ValidateManifest( errors = [], manifest = manifest, manifestPath = manifestPath, - sg = sg, - jsonSchema = sg.get_json_schema_requirements(rootNode, rootNode + "_validation") + dmge = dmge, + jsonSchema = json_schema ) #perform validation with no exceptions raised _, errors, warnings = validateManifest.validate_manifest_rules( manifest = manifest, - sg = sg, + dmge = dmge, restrict_rules = False, project_scope = None, ) - - - - \ No newline at end of file diff --git a/tests/test_validator.py b/tests/test_validator.py new file mode 100644 index 000000000..641fcccea --- /dev/null +++ b/tests/test_validator.py @@ -0,0 +1,109 @@ +from io import StringIO +import json +import networkx as nx +import os +import pandas as pd +import pytest +import logging + + +from schematic.schemas.data_model_parser import DataModelParser +from schematic.schemas.data_model_graph import DataModelGraph +from schematic.schemas.data_model_validator import DataModelValidator +from schematic.schemas.data_model_jsonld import DataModelJsonLD, convert_graph_to_jsonld + + + +logging.basicConfig(level=logging.DEBUG) +logger = logging.getLogger(__name__) + +def graph_data_model_func(helpers, data_model_name): + path_to_data_model = helpers.get_data_path(data_model_name) + + # Instantiate Parser + data_model_parser = DataModelParser(path_to_data_model=path_to_data_model) + + #Parse Model + parsed_data_model = data_model_parser.parse_model() + + # Convert parsed model to graph + # Instantiate DataModelGraph + data_model_grapher = DataModelGraph(parsed_data_model) + + # Generate graph + graph_data_model = data_model_grapher.generate_data_model_graph() + + return graph_data_model + + + +class TestDataModelValidator: + def test_check_blacklisted_characters(self, helpers): + # Get graph data model + graph_data_model = graph_data_model_func(helpers, data_model_name='validator_test.model.csv') + + # Instantiate Data Model Validator + DMV = DataModelValidator(graph_data_model) + + # Run validation + validator_errors = DMV.check_blacklisted_characters() + + # Expected Error + expected_error = ['Node: Patient) contains a blacklisted character(s): ), they will be striped if used in Synapse annotations.', + 'Node: Patient ID. contains a blacklisted character(s): ., they will be striped if used in Synapse annotations.', + 'Node: Sex- contains a blacklisted character(s): -, they will be striped if used in Synapse annotations.', + 'Node: Year of Birth( contains a blacklisted character(s): (, they will be striped if used in Synapse annotations.', + 'Node: Bulk RNA-seq Assay contains a blacklisted character(s): -, they will be striped if used in Synapse annotations.', + ] + + assert expected_error == validator_errors + + def test_check_reserved_names(self, helpers): + # Get graph data model + graph_data_model = graph_data_model_func(helpers, data_model_name='validator_test.model.csv') + + # Instantiate Data Model Validator + DMV = DataModelValidator(graph_data_model) + + # Run validation + validator_errors = DMV.check_reserved_names() + + # Expected Error + expected_error = ['Your data model entry name: EntityId overlaps with the reserved name: entityId. 
Please change this name in your data model.'] + assert expected_error == validator_errors + + def test_check_graph_has_required_node_fields(self, helpers): + # Get graph data model + graph_data_model = graph_data_model_func(helpers, data_model_name='validator_test.model.csv') + + # Remove a field from an entry graph + del graph_data_model.nodes['Cancer']['label'] + + # Instantiate Data Model Validator + DMV = DataModelValidator(graph_data_model) + + # Run validation + validator_errors = DMV.check_graph_has_required_node_fields() + + # Expected Error + expected_error = ['For entry: Cancer, the required field label is missing in the data model graph, please double check your model and generate the graph again.'] + assert expected_error == validator_errors + + def test_dag(self, helpers): + # TODO: The schema validator currently doesn't catch the Diagnosis-Diagnosis self loop. + # It is an expected error but it will need to be decided if the validator should prevent or allow such self loops + + # Get graph data model + graph_data_model = graph_data_model_func(helpers, data_model_name='validator_dag_test.model.csv') + + # Instantiate Data Model Validator + DMV = DataModelValidator(graph_data_model) + + # Run validation + validator_errors = DMV.check_is_dag() + + # nodes could be in different order so need to account for that + expected_errors = ['Schematic requires models be a directed acyclic graph (DAG). Please inspect your model.'] + + assert validator_errors[0] in expected_errors +