import logging
from itertools import combinations
from typing import Any, Dict, List, Optional

logger = logging.getLogger(__name__)


class LinksProcessor:
    """Build ``owl:sameAs`` links between external authorities cited in a graph.

    Each item of the JSON-LD ``@graph`` is scanned for the link facets listed
    in ``self.linkfacets``.  Every ``{"@id": <url>}`` value found there is
    converted to a compact URI (CURIE) by the matching entry of
    ``self.url_transformers``; URLs starting with a prefix in
    ``self.ignore_urls`` are dropped silently, and any other URL is logged
    as unmatched.

    Args:
        graph: The ``@graph`` list of a record.  ``None`` (a record without
            ``@graph``) is treated as an empty graph.
    """

    def __init__(self, graph: Optional[List[Dict[str, Any]]]):
        # The caller passes ``data.get("@graph")``, which may be None.
        self.graph = graph or []
        self.linkfacets = [
            "hasExactExternalAuthority",
            "hasCloseExternalAuthority",
            "identifiesRWO",
        ]
        # Facet keys are looked up both bare and namespace-prefixed: the
        # ingestion filter for this dataset tests for "madsrdf:identifiesRWO"
        # / "madsrdf:hasCloseExternalAuthority", so prefixed keys must match.
        self.facet_prefixes = ("", "madsrdf:")
        self.ignore_urls = [
            "http://viaf.org/viaf/sourceID/",
            "http://musicbrainz.org/",
        ]
        self.url_transformers = {
            "http://dbpedia.org/resource/": {
                "uri": lambda url: f"dbp:{self._local_id(url)}",
            },
            "http://sws.geonames.org/": {
                "uri": lambda url: f"gn:{self._local_id(url)}",
            },
            "http://id.loc.gov/rwo/agents/": {
                "uri": lambda url: f"loc:{self._local_id(url)}",
            },
            "http://vocab.getty.edu/tgn/": {
                "uri": lambda url: f"tgn:{self._local_id(url).removesuffix('-place')}",
            },
            "http://www.viaf.org/viaf/": {
                "uri": lambda url: f"viaf:{self._local_id(url)}",
            },
            "http://www.wikidata.org/entity/": {
                "uri": lambda url: f"wd:{self._local_id(url)}",
            },
        }
        self.uris = set()
        self.links = []

    @staticmethod
    def _local_id(url: str) -> str:
        """Return the last non-empty path segment of *url*.

        A trailing slash is stripped first so that
        ``http://sws.geonames.org/2643743/`` yields ``2643743`` instead of
        an empty string.
        """
        return url.rstrip("/").rsplit("/", 1)[-1]

    def _check_url(self, url: str) -> None:
        """Add the CURIE for *url* to ``self.uris``.

        Ignored URLs are skipped without logging; URLs matching neither an
        ignore prefix nor a transformer prefix are logged as a warning.
        """
        if not isinstance(url, str):
            return
        # Return (not break) so an ignored URL never reaches the
        # "Unmatched URL" warning below.
        if url.startswith(tuple(self.ignore_urls)):
            return
        for prefix, transformers in self.url_transformers.items():
            if url.startswith(prefix):
                self.uris.add(transformers["uri"](url))
                return
        logger.warning("Unmatched URL: %s", url)

    def process(self) -> List[Dict[str, Any]]:
        """Collect the linked URIs and return one link per unordered pair.

        Returns:
            A list of ``{"place_curie", "predicate", "object"}`` dicts with
            predicate ``owl:sameAs``.  Each pair of distinct URIs appears
            exactly once (no symmetrical duplicate), ordered by sorted CURIE
            so the output is deterministic.  Calling this method again
            rebuilds the list rather than appending to it.
        """
        # Populate self.uris using self.linkfacets from self.graph
        for item in self.graph:
            if not isinstance(item, dict):
                continue
            for facet in self.linkfacets:
                for prefix in self.facet_prefixes:
                    key = f"{prefix}{facet}"
                    if key not in item:
                        continue
                    facet_value = item[key]
                    # A single link is serialized as a bare object, several as a list.
                    if isinstance(facet_value, dict):
                        facet_value = [facet_value]
                    if not isinstance(facet_value, list):
                        continue
                    for value in facet_value:
                        if isinstance(value, dict) and "@id" in value:
                            self._check_url(value["@id"])

        # combinations() yields each unordered pair once, so (x, y) is never
        # accompanied by its mirror (y, x).
        self.links = [
            {
                "place_curie": x,
                "predicate": "owl:sameAs",
                "object": y,
            }
            for x, y in combinations(sorted(self.uris), 2)
        ]

        return self.links
a/vespa/repository/api/ingestion/subtransformers/wikidata/__init__.py b/vespa/repository/api/ingestion/subtransformers/wikidata/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/vespa/repository/api/ingestion/transformers.py b/vespa/repository/api/ingestion/transformers.py index bd0f03ed..38441ffa 100644 --- a/vespa/repository/api/ingestion/transformers.py +++ b/vespa/repository/api/ingestion/transformers.py @@ -3,6 +3,7 @@ import logging from .subtransformers.geonames.names import NamesProcessor as GeonamesNamesProcessor +from .subtransformers.loc.links import LinksProcessor as LOCLinksProcessor from .subtransformers.osm.names import NamesProcessor as OSMNamesProcessor from .subtransformers.osm.types import TypesProcessor as OSMTypesProcessor from .subtransformers.pleiades.links import LinksProcessor as PleiadesLinksProcessor @@ -279,8 +280,7 @@ class DocTransformer: }, [ ], - [ # No links - ] + LOCLinksProcessor(data.get("@graph")).process() ) ], "GB1900": [ # TODO