From 41d410f347dbd3c53a880caced4e8a51de6efec2 Mon Sep 17 00:00:00 2001
From: Birger Schacht
Date: Thu, 23 Jan 2025 19:47:20 +0100
Subject: [PATCH] feat(utils): implement new type rdf importer

---
 .../triple_configs/E21_PersonFromDNB.toml     | 13 +++
 apis_core/utils/rdf2.py                       | 90 +++++++++++++++++++
 2 files changed, 103 insertions(+)
 create mode 100644 apis_core/apis_entities/triple_configs/E21_PersonFromDNB.toml
 create mode 100644 apis_core/utils/rdf2.py

diff --git a/apis_core/apis_entities/triple_configs/E21_PersonFromDNB.toml b/apis_core/apis_entities/triple_configs/E21_PersonFromDNB.toml
new file mode 100644
index 000000000..326d3f056
--- /dev/null
+++ b/apis_core/apis_entities/triple_configs/E21_PersonFromDNB.toml
@@ -0,0 +1,13 @@
+[[filters]]
+"rdf:type" = "gndo:DifferentiatedPerson"
+
+[attributes]
+forename = ["gndo:forename", "gndo:preferredNameEntityForThePerson/gndo:forename"]
+alternative_names = "gndo:variantNameForThePerson"
+surname = ["gndo:surname", "gndo:preferredNameEntityForThePerson/gndo:surname"]
+start_date_written = "gndo:dateOfBirth"
+end_date_written = "gndo:dateOfDeath"
+same_as = "owl:sameAs"
+profession = "gndo:professionOrOccupation"
+
+relations = ["gndo:placeOfDeath", "gndo:placeOfBirth"]
diff --git a/apis_core/utils/rdf2.py b/apis_core/utils/rdf2.py
new file mode 100644
index 000000000..94c78fd07
--- /dev/null
+++ b/apis_core/utils/rdf2.py
@@ -0,0 +1,90 @@
+# SPDX-FileCopyrightText: 2025 Birger Schacht
+# SPDX-License-Identifier: MIT
+
+import logging
+from collections import defaultdict
+
+from AcdhArcheAssets.uri_norm_rules import get_normalized_uri
+from rdflib import BNode, Graph, RDF
+
+from apis_core.utils.settings import dict_from_toml_directory
+
+logger = logging.getLogger(__name__)
+
+
+def find_matching_config(graph: Graph) -> dict | None:
+    """
+    Return the first triple config that has a filter matching `graph`.
+
+    The configs are read using `dict_from_toml_directory("triple_configs")`.
+    A filter maps predicate CURIEs to object CURIEs; it matches if the graph
+    contains a triple for every one of those pairs. Returns `None` if no
+    config matches.
+    """
+    triple_configs = dict_from_toml_directory("triple_configs")
+    for path, config in triple_configs.items():
+        for _filter in config.get("filters", []):
+            try:
+                triples = [
+                    (
+                        None,
+                        graph.namespace_manager.expand_curie(predicate),
+                        graph.namespace_manager.expand_curie(obj),
+                    )
+                    for predicate, obj in _filter.items()
+                ]
+            except ValueError as e:
+                # `expand_curie` raises a ValueError for malformed CURIEs and
+                # for prefixes that are not bound in the graph. Such a filter
+                # can not match, so skip it instead of aborting the lookup.
+                logger.debug("Skipping filter %s of %s: %s", _filter, path, e)
+                continue
+            if all(triple in graph for triple in triples):
+                logger.debug("Using %s for parsing graph", path)
+                return config
+    return None
+
+
+def get_something_from_uri(uri: str) -> dict | None:
+    """
+    Parse the RDF data behind `uri` and extract the configured attributes.
+
+    The URI is normalized, parsed into a graph and the graph is matched
+    against the triple configs. Every attribute of the matching config is
+    looked up by using its CURIE(s) as predicate of a SPARQL query. For
+    objects that are blank nodes, the objects of that node are used
+    instead. The values of the special `relations` attribute are collected
+    per CURIE under the `relations` key. Returns `None` if no config
+    matches the graph.
+    """
+    uri = get_normalized_uri(uri)
+    graph = Graph()
+    graph.parse(uri)
+
+    config = find_matching_config(graph)
+    if not config:
+        return None
+
+    result = defaultdict(list)
+    result["relations"] = defaultdict(list)
+
+    for attribute, curies in config.get("attributes", {}).items():
+        if isinstance(curies, str):
+            curies = [curies]
+        for curie in curies:
+            values = []
+            # The CURIE is taken from the triple config and inserted verbatim
+            # as the predicate of the query.
+            rows = graph.query(f"SELECT ?object WHERE {{ ?subject {curie} ?object }}")
+            objects = [row.object for row in rows]
+            for obj in objects:
+                if isinstance(obj, BNode):
+                    values.extend(value.toPython() for value in graph.objects(subject=obj) if value != RDF.Seq)
+                else:
+                    values.append(obj.toPython())
+
+            if attribute == "relations":
+                result["relations"][curie].extend(values)
+            else:
+                result[attribute].extend(values)
+    return dict(result)