Skip to content

Commit

Permalink
Merge pull request #69 from Markus92/WP4-132_no_catalog_harvest
Browse files Browse the repository at this point in the history
Make catalog harvesting optional
  • Loading branch information
hcvdwerf authored Aug 7, 2024
2 parents cb9b75a + 20f829b commit 0f1fc2c
Show file tree
Hide file tree
Showing 3 changed files with 88 additions and 42 deletions.
10 changes: 4 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,13 +86,11 @@ To install gdi-userportal-ckanext-fairdatapoint:

## Config settings

None at present
There is a setting `ckanext.fairdatapoint.harvest_catalogs`. Default is `false`. If set to `true`,
CKAN will harvest catalogs as datasets.

**TODO:** Document any optional config settings here. For example:

# The minimum number of hours to wait before re-checking a resource
# (optional, default: 24).
ckanext.fairdatapoint.some_setting = some_default_value
The setting can be overridden in the harvester profile, by setting `"harvest_catalogs": "true"` or
`"harvest_catalogs": "false"` in the harvester configuration JSON.


## Developer installation
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,23 +18,28 @@
from typing import Dict, Iterable, Union


LDP = Namespace('http://www.w3.org/ns/ldp#')
VCARD = Namespace('http://www.w3.org/2006/vcard/ns#')
LDP = Namespace("http://www.w3.org/ns/ldp#")
VCARD = Namespace("http://www.w3.org/2006/vcard/ns#")

log = logging.getLogger(__name__)


class FairDataPointRecordProvider:

def __init__(self, fdp_end_point: str):
def __init__(self, fdp_end_point: str, harvest_catalogs: bool = False):
self.fair_data_point = FairDataPoint(fdp_end_point)
self.harvest_catalogs = harvest_catalogs

def get_record_ids(self) -> Dict.keys:
"""
Returns all the FDP records which should end up as packages in CKAN to populate the "guids_in_harvest" list
https://rdflib.readthedocs.io/en/stable/intro_to_parsing.html
"""
log.debug('FAIR Data Point get_records from {}'.format(self.fair_data_point.fdp_end_point))
log.debug(
"FAIR Data Point get_records from {}".format(
self.fair_data_point.fdp_end_point
)
)

result = dict()

Expand All @@ -52,20 +57,21 @@ def _process_catalog(self, path: Union[str, URIRef]) -> Dict:
catalogs_graph = self.fair_data_point.get_graph(path)

for catalog_subject in catalogs_graph.subjects(RDF.type, DCAT.Catalog):
identifier = Identifier('')
identifier = Identifier("")

identifier.add('catalog', str(catalog_subject))
identifier.add("catalog", str(catalog_subject))

result[identifier.guid] = catalog_subject
if self.harvest_catalogs:
result[identifier.guid] = catalog_subject

catalog_graph = self.fair_data_point.get_graph(catalog_subject)

for dataset_subject in catalog_graph.objects(predicate=DCAT.dataset):
identifier = Identifier('')
identifier = Identifier("")

identifier.add('catalog', str(catalog_subject))
identifier.add("catalog", str(catalog_subject))

identifier.add('dataset', str(dataset_subject))
identifier.add("dataset", str(dataset_subject))

result[identifier.guid] = dataset_subject

Expand All @@ -76,7 +82,10 @@ def get_record_by_id(self, guid: str) -> str:
Get additional information for FDP record.
"""
log.debug(
'FAIR data point get_record_by_id from {} for {}'.format(self.fair_data_point.fdp_end_point, guid))
"FAIR data point get_record_by_id from {} for {}".format(
self.fair_data_point.fdp_end_point, guid
)
)

identifier = Identifier(guid)

Expand All @@ -89,7 +98,9 @@ def get_record_by_id(self, guid: str) -> str:
self._remove_fdp_defaults(g, subject_uri)

# Add information from distribution to graph
for distribution_uri in g.objects(subject=subject_uri, predicate=DCAT.distribution):
for distribution_uri in g.objects(
subject=subject_uri, predicate=DCAT.distribution
):
distribution_g = self.fair_data_point.get_graph(distribution_uri)

self._remove_fdp_defaults(g, distribution_uri)
Expand All @@ -99,17 +110,21 @@ def get_record_by_id(self, guid: str) -> str:
DCTERMS.format,
DCTERMS.license,
DCTERMS.title,
DCAT.accessURL
DCAT.accessURL,
]:
for distr_attribute_value in self.get_values(distribution_g, distribution_uri, predicate):
for distr_attribute_value in self.get_values(
distribution_g, distribution_uri, predicate
):
g.add((distribution_uri, predicate, distr_attribute_value))

# Look-up contact information
for contact_point_uri in self.get_values(g, subject_uri, DCAT.contactPoint):
if isinstance(contact_point_uri, URIRef):
self._parse_contact_point(g=g, subject_uri=subject_uri, contact_point_uri=contact_point_uri)
self._parse_contact_point(
g=g, subject_uri=subject_uri, contact_point_uri=contact_point_uri
)

result = g.serialize(format='ttl')
result = g.serialize(format="ttl")

return result

Expand All @@ -123,19 +138,23 @@ def _parse_contact_point(g: Graph, subject_uri: URIRef, contact_point_uri: URIRe
g.add((subject_uri, DCAT.contactPoint, vcard_node))
g.add((vcard_node, RDF.type, VCARD.Kind))
g.add((vcard_node, VCARD.hasUID, contact_point_uri))
if 'orcid' in str(contact_point_uri):
if "orcid" in str(contact_point_uri):
try:
orcid_response = requests.get(str(contact_point_uri).rstrip('/') + '/public-record.json')
orcid_response = requests.get(
str(contact_point_uri).rstrip("/") + "/public-record.json"
)
json_orcid_response = orcid_response.json()
name = json_orcid_response['displayName']
name = json_orcid_response["displayName"]
g.add((vcard_node, VCARD.fn, Literal(name)))
except (JSONDecodeError, HTTPError) as e:
log.error(f'Failed to get data from ORCID for {contact_point_uri}: {e}')
log.error(f"Failed to get data from ORCID for {contact_point_uri}: {e}")

@staticmethod
def get_values(graph: Graph,
subject: Union[str, URIRef, Node],
predicate: Union[str, URIRef, Node]) -> Iterable[Node]:
def get_values(
graph: Graph,
subject: Union[str, URIRef, Node],
predicate: Union[str, URIRef, Node],
) -> Iterable[Node]:
subject_uri = URIRef(subject)
predicate_uri = URIRef(predicate)

Expand All @@ -144,8 +163,8 @@ def get_values(graph: Graph,

@staticmethod
def _remove_fdp_defaults(g, subject_uri):
for (s, p, o) in g.triples((subject_uri, DCTERMS.accessRights, None)):
access_rights_default = URIRef(f'{subject_uri}#accessRights')
for s, p, o in g.triples((subject_uri, DCTERMS.accessRights, None)):
access_rights_default = URIRef(f"{subject_uri}#accessRights")
if o == access_rights_default:
g.remove((subject_uri, DCTERMS.accessRights, o))
g.remove((access_rights_default, None, None))
Original file line number Diff line number Diff line change
Expand Up @@ -2,30 +2,59 @@
# SPDX-FileContributor: 2024 Stichting Health-RI
#
# SPDX-License-Identifier: AGPL-3.0-only

import logging

from ckanext.fairdatapoint.harvesters.civity_harvester import CivityHarvester
from ckanext.fairdatapoint.harvesters.domain.fair_data_point_record_provider import FairDataPointRecordProvider
from ckanext.fairdatapoint.harvesters.domain.fair_data_point_record_to_package_converter import \
FairDataPointRecordToPackageConverter
from ckanext.fairdatapoint.harvesters.domain.fair_data_point_record_provider import (
FairDataPointRecordProvider,
)
from ckanext.fairdatapoint.harvesters.domain.fair_data_point_record_to_package_converter import (
FairDataPointRecordToPackageConverter,
)
from ckan.plugins import toolkit

PROFILE = "profile"
HARVEST_CATALOG = "harvest_catalogs"
HARVEST_CATALOG_CONFIG = "ckanext.fairdatapoint.harvest_catalogs"

PROFILE = 'profile'
log = logging.getLogger(__name__)


class FairDataPointCivityHarvester(CivityHarvester):

def _get_harvest_catalog_setting(self, harvest_config_dict):
    """Return whether catalogs should be harvested.

    The ``harvest_catalogs`` entry of the harvester configuration takes
    precedence when present; otherwise the global CKAN option
    ``ckanext.fairdatapoint.harvest_catalogs`` is read, falling back to
    ``False``. Whichever value is found is converted with
    ``toolkit.asbool`` before being returned.

    :param harvest_config_dict: harvester configuration mapping
    :return: the resolved setting as converted by ``toolkit.asbool``
    """
    # Pick the raw value first; conversion and final logging are shared.
    if HARVEST_CATALOG in harvest_config_dict:
        log.debug("Using harvest_catalogs from harvest_config_dict")
        raw_setting = harvest_config_dict[HARVEST_CATALOG]
    else:
        log.debug("Using harvest_catalogs from global CKAN config")
        raw_setting = toolkit.config.get(HARVEST_CATALOG_CONFIG, False)

    catalogs_enabled = toolkit.asbool(raw_setting)
    log.debug("Harvesting catalogs is set to %s", catalogs_enabled)
    return catalogs_enabled

def setup_record_provider(self, harvest_url, harvest_config_dict):
self.record_provider = FairDataPointRecordProvider(harvest_url)
# Harvest catalog config can be set on global CKAN level, but can be overridden by harvest config
harvest_catalogs = self._get_harvest_catalog_setting(harvest_config_dict)

self.record_provider = FairDataPointRecordProvider(
harvest_url, harvest_catalogs
)

def setup_record_to_package_converter(self, harvest_url, harvest_config_dict):
if PROFILE in harvest_config_dict:
self.record_to_package_converter = FairDataPointRecordToPackageConverter(harvest_config_dict.get(PROFILE))
self.record_to_package_converter = FairDataPointRecordToPackageConverter(
harvest_config_dict.get(PROFILE)
)
else:
raise Exception('[{0}] not found in harvester config JSON'.format(PROFILE))
raise Exception("[{0}] not found in harvester config JSON".format(PROFILE))

def info(self):
return {
'name': 'fair_data_point_harvester',
'title': 'FAIR data point harvester',
'description': 'Harvester for end points implementing the FAIR data point protocol'
"name": "fair_data_point_harvester",
"title": "FAIR data point harvester",
"description": "Harvester for end points implementing the FAIR data point protocol",
}

0 comments on commit 0f1fc2c

Please sign in to comment.