LOC ingestion
docuracy committed Jan 28, 2025
1 parent 09f044b commit 48c1894
Showing 9 changed files with 95 additions and 10 deletions.
2 changes: 2 additions & 0 deletions vespa/repository/api/ingestion/config.py
@@ -143,6 +143,8 @@
"madsrdf:GeographicElement" in graph_item.get("@type", [])
for graph_item in record.get("@graph", [])
) and any(
"madsrdf:identifiesRWO" in graph_item
or
"madsrdf:hasCloseExternalAuthority" in graph_item
or
(
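For readers unfamiliar with the MADS/RDF shape this filter targets, here is a minimal sketch of the visible condition applied to a hypothetical record; the record contents are invented, and the trailing clause hidden by the truncated hunk is omitted:

# Hypothetical LOC JSON-LD record, for illustration only
record = {
    "@graph": [
        {"@type": ["madsrdf:Geographic", "madsrdf:GeographicElement"]},
        {"madsrdf:hasCloseExternalAuthority": {"@id": "http://www.wikidata.org/entity/Q60"}},
    ]
}

# Accept the record only if some @graph item is typed as a GeographicElement
# and some item carries an identifiesRWO or hasCloseExternalAuthority key.
is_candidate = any(
    "madsrdf:GeographicElement" in graph_item.get("@type", [])
    for graph_item in record.get("@graph", [])
) and any(
    "madsrdf:identifiesRWO" in graph_item
    or "madsrdf:hasCloseExternalAuthority" in graph_item
    for graph_item in record.get("@graph", [])
)

print(is_candidate)  # True for this toy record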
18 changes: 10 additions & 8 deletions vespa/repository/api/ingestion/processor.py
@@ -219,7 +219,9 @@ async def process_document(document, dataset_config, transformer_index, sync_app

# logger.info(f"Transformed document {transformed_document}")
# logger.info(f"Toponyms: {toponyms}")
# logger.info(f"Links: {links}")
logger.info(f"Links: {links}")
# terminate
return {"success": True}

try:
response = await asyncio.get_event_loop().run_in_executor(
@@ -305,13 +307,13 @@ async def process_batch(batch):
continue

## Examine the first 3 documents and then terminate
if count < 50:
logger.info(f"Document {count}: {document}")
count += 1
# Get next document
continue
else: # Terminate
break
# if count < 3:
# logger.info(f"Document {count}: {document}")
# count += 1
# # Get next document
# continue
# else: # Terminate
# break

current_batch.append(document)
count += 1
Empty file.
Empty file.
81 changes: 81 additions & 0 deletions vespa/repository/api/ingestion/subtransformers/loc/links.py
@@ -0,0 +1,81 @@
import logging
from typing import List, Dict, Any

logger = logging.getLogger(__name__)


class LinksProcessor:
    def __init__(self, graph: List[Dict[str, Any]]):
        self.graph = graph
        self.linkfacets = [
            "hasExactExternalAuthority",
            "hasCloseExternalAuthority",
            "identifiesRWO",
        ]
        self.ignore_urls = [
            "http://viaf.org/viaf/sourceID/",
            "http://musicbrainz.org/",
        ]
        self.url_transformers = {
            "http://dbpedia.org/resource/": {
                "uri": lambda url: f"dbp:{url.split('/')[-1]}",
            },
            "http://sws.geonames.org/": {
                "uri": lambda url: f"gn:{url.split('/')[-1]}",
            },
            "http://id.loc.gov/rwo/agents/": {
                "uri": lambda url: f"loc:{url.split('/')[-1]}",
            },
            "http://vocab.getty.edu/tgn/": {
                "uri": lambda url: f"tgn:{url.split('/')[-1].removesuffix('-place')}",
            },
            "http://www.viaf.org/viaf/": {
                "uri": lambda url: f"viaf:{url.split('/')[-1]}",
            },
            "http://www.wikidata.org/entity/": {
                "uri": lambda url: f"wd:{url.split('/')[-1]}",
            },
        }
        self.uris = set()
        self.links = []

    def _check_url(self, url: str) -> None:
        for prefix in self.ignore_urls:
            if url.startswith(prefix):
                break
        for prefix, transformers in self.url_transformers.items():
            if url.startswith(prefix):
                self.uris.add(transformers["uri"](url))
                break
        else:
            logger.warning(f"Unmatched URL: {url}")

    def process(self) -> List[Dict[str, Any]]:

        # Populate self.uris using self.linkfacets from self.graph
        for item in self.graph:
            for facet in self.linkfacets:
                # Check if the facet exists in the item and extract the URIs
                if facet in item:
                    facet_value = item[facet]
                    if isinstance(facet_value, dict):
                        facet_value = [facet_value]
                    for value in facet_value:
                        if isinstance(value, dict) and "@id" in value:
                            self._check_url(value["@id"])

        # Generate a link for every unique combination of self.uris (no symmetrical links)
        seen_pairs = set()  # Track pairs to avoid duplicates
        self.links.extend([
            {
                "place_curie": x,
                "predicate": "owl:sameAs",
                "object": y,
            }
            for x in self.uris
            for y in self.uris
            if x != y and (x, y) not in seen_pairs and not seen_pairs.add((x, y))
        ])

        # logger.info(f"Processed links: {self.links}")
        return self.links
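As a quick sanity check on the class above, a minimal usage sketch with an invented @graph fragment; note that the facet keys must match self.linkfacets exactly, and every identifier below is illustrative rather than taken from a real LOC record:

graph = [
    {
        "hasCloseExternalAuthority": [
            {"@id": "http://www.wikidata.org/entity/Q60"},
            {"@id": "http://sws.geonames.org/5128581"},
        ],
    },
    {
        "identifiesRWO": {"@id": "http://id.loc.gov/rwo/agents/no2015140931"},
    },
]

links = LinksProcessor(graph).process()
# The three URLs are normalised to CURIEs (wd:Q60, gn:5128581, loc:no2015140931)
# and an owl:sameAs link is emitted for each ordered pair of distinct CURIEs, e.g.
# {"place_curie": "wd:Q60", "predicate": "owl:sameAs", "object": "gn:5128581"}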
Empty file.
Empty file.
Empty file.
4 changes: 2 additions & 2 deletions vespa/repository/api/ingestion/transformers.py
@@ -3,6 +3,7 @@
import logging

from .subtransformers.geonames.names import NamesProcessor as GeonamesNamesProcessor
from .subtransformers.loc.links import LinksProcessor as LOCLinksProcessor
from .subtransformers.osm.names import NamesProcessor as OSMNamesProcessor
from .subtransformers.osm.types import TypesProcessor as OSMTypesProcessor
from .subtransformers.pleiades.links import LinksProcessor as PleiadesLinksProcessor
@@ -279,8 +280,7 @@ class DocTransformer:
},
[
],
[ # No links
]
LOCLinksProcessor(data.get("@graph")).process()
)
],
"GB1900": [ # TODO
