From d694918aaffb5ca23fbc8e0186df377460a5733b Mon Sep 17 00:00:00 2001 From: Mark Janse Date: Mon, 5 Aug 2024 14:30:34 +0200 Subject: [PATCH 1/5] feat: add option for harvesting catalogs --- .../domain/fair_data_point_record_provider.py | 69 ++++++++++++------- .../fair_data_point_civity_harvester.py | 39 ++++++++--- 2 files changed, 73 insertions(+), 35 deletions(-) diff --git a/ckanext/fairdatapoint/harvesters/domain/fair_data_point_record_provider.py b/ckanext/fairdatapoint/harvesters/domain/fair_data_point_record_provider.py index 6e3f7c8..0ae60c9 100644 --- a/ckanext/fairdatapoint/harvesters/domain/fair_data_point_record_provider.py +++ b/ckanext/fairdatapoint/harvesters/domain/fair_data_point_record_provider.py @@ -18,23 +18,28 @@ from typing import Dict, Iterable, Union -LDP = Namespace('http://www.w3.org/ns/ldp#') -VCARD = Namespace('http://www.w3.org/2006/vcard/ns#') +LDP = Namespace("http://www.w3.org/ns/ldp#") +VCARD = Namespace("http://www.w3.org/2006/vcard/ns#") log = logging.getLogger(__name__) class FairDataPointRecordProvider: - def __init__(self, fdp_end_point: str): + def __init__(self, fdp_end_point: str, harvest_catalogs: bool = False): self.fair_data_point = FairDataPoint(fdp_end_point) + self.harvest_catalogs = harvest_catalogs def get_record_ids(self) -> Dict.keys: """ Returns all the FDP records which should end up as packages in CKAN to populate the "guids_in_harvest" list https://rdflib.readthedocs.io/en/stable/intro_to_parsing.html """ - log.debug('FAIR Data Point get_records from {}'.format(self.fair_data_point.fdp_end_point)) + log.debug( + "FAIR Data Point get_records from {}".format( + self.fair_data_point.fdp_end_point + ) + ) result = dict() @@ -52,20 +57,21 @@ def _process_catalog(self, path: Union[str, URIRef]) -> Dict: catalogs_graph = self.fair_data_point.get_graph(path) for catalog_subject in catalogs_graph.subjects(RDF.type, DCAT.Catalog): - identifier = Identifier('') + identifier = Identifier("") - identifier.add('catalog', 
str(catalog_subject)) + identifier.add("catalog", str(catalog_subject)) - result[identifier.guid] = catalog_subject + if self.harvest_catalogs: + result[identifier.guid] = catalog_subject catalog_graph = self.fair_data_point.get_graph(catalog_subject) for dataset_subject in catalog_graph.objects(predicate=DCAT.dataset): - identifier = Identifier('') + identifier = Identifier("") - identifier.add('catalog', str(catalog_subject)) + identifier.add("catalog", str(catalog_subject)) - identifier.add('dataset', str(dataset_subject)) + identifier.add("dataset", str(dataset_subject)) result[identifier.guid] = dataset_subject @@ -76,7 +82,10 @@ def get_record_by_id(self, guid: str) -> str: Get additional information for FDP record. """ log.debug( - 'FAIR data point get_record_by_id from {} for {}'.format(self.fair_data_point.fdp_end_point, guid)) + "FAIR data point get_record_by_id from {} for {}".format( + self.fair_data_point.fdp_end_point, guid + ) + ) identifier = Identifier(guid) @@ -89,7 +98,9 @@ def get_record_by_id(self, guid: str) -> str: self._remove_fdp_defaults(g, subject_uri) # Add information from distribution to graph - for distribution_uri in g.objects(subject=subject_uri, predicate=DCAT.distribution): + for distribution_uri in g.objects( + subject=subject_uri, predicate=DCAT.distribution + ): distribution_g = self.fair_data_point.get_graph(distribution_uri) self._remove_fdp_defaults(g, distribution_uri) @@ -99,17 +110,21 @@ def get_record_by_id(self, guid: str) -> str: DCTERMS.format, DCTERMS.license, DCTERMS.title, - DCAT.accessURL + DCAT.accessURL, ]: - for distr_attribute_value in self.get_values(distribution_g, distribution_uri, predicate): + for distr_attribute_value in self.get_values( + distribution_g, distribution_uri, predicate + ): g.add((distribution_uri, predicate, distr_attribute_value)) # Look-up contact information for contact_point_uri in self.get_values(g, subject_uri, DCAT.contactPoint): if isinstance(contact_point_uri, URIRef): - 
self._parse_contact_point(g=g, subject_uri=subject_uri, contact_point_uri=contact_point_uri) + self._parse_contact_point( + g=g, subject_uri=subject_uri, contact_point_uri=contact_point_uri + ) - result = g.serialize(format='ttl') + result = g.serialize(format="ttl") return result @@ -123,19 +138,23 @@ def _parse_contact_point(g: Graph, subject_uri: URIRef, contact_point_uri: URIRe g.add((subject_uri, DCAT.contactPoint, vcard_node)) g.add((vcard_node, RDF.type, VCARD.Kind)) g.add((vcard_node, VCARD.hasUID, contact_point_uri)) - if 'orcid' in str(contact_point_uri): + if "orcid" in str(contact_point_uri): try: - orcid_response = requests.get(str(contact_point_uri).rstrip('/') + '/public-record.json') + orcid_response = requests.get( + str(contact_point_uri).rstrip("/") + "/public-record.json" + ) json_orcid_response = orcid_response.json() - name = json_orcid_response['displayName'] + name = json_orcid_response["displayName"] g.add((vcard_node, VCARD.fn, Literal(name))) except (JSONDecodeError, HTTPError) as e: - log.error(f'Failed to get data from ORCID for {contact_point_uri}: {e}') + log.error(f"Failed to get data from ORCID for {contact_point_uri}: {e}") @staticmethod - def get_values(graph: Graph, - subject: Union[str, URIRef, Node], - predicate: Union[str, URIRef, Node]) -> Iterable[Node]: + def get_values( + graph: Graph, + subject: Union[str, URIRef, Node], + predicate: Union[str, URIRef, Node], + ) -> Iterable[Node]: subject_uri = URIRef(subject) predicate_uri = URIRef(predicate) @@ -144,8 +163,8 @@ def get_values(graph: Graph, @staticmethod def _remove_fdp_defaults(g, subject_uri): - for (s, p, o) in g.triples((subject_uri, DCTERMS.accessRights, None)): - access_rights_default = URIRef(f'{subject_uri}#accessRights') + for s, p, o in g.triples((subject_uri, DCTERMS.accessRights, None)): + access_rights_default = URIRef(f"{subject_uri}#accessRights") if o == access_rights_default: g.remove((subject_uri, DCTERMS.accessRights, o)) 
g.remove((access_rights_default, None, None)) diff --git a/ckanext/fairdatapoint/harvesters/fair_data_point_civity_harvester.py b/ckanext/fairdatapoint/harvesters/fair_data_point_civity_harvester.py index b91b4a5..b75fded 100644 --- a/ckanext/fairdatapoint/harvesters/fair_data_point_civity_harvester.py +++ b/ckanext/fairdatapoint/harvesters/fair_data_point_civity_harvester.py @@ -5,27 +5,46 @@ from ckanext.fairdatapoint.harvesters.civity_harvester import CivityHarvester -from ckanext.fairdatapoint.harvesters.domain.fair_data_point_record_provider import FairDataPointRecordProvider -from ckanext.fairdatapoint.harvesters.domain.fair_data_point_record_to_package_converter import \ - FairDataPointRecordToPackageConverter +from ckanext.fairdatapoint.harvesters.domain.fair_data_point_record_provider import ( + FairDataPointRecordProvider, +) +from ckanext.fairdatapoint.harvesters.domain.fair_data_point_record_to_package_converter import ( + FairDataPointRecordToPackageConverter, +) +from ckan.plugins import toolkit -PROFILE = 'profile' +PROFILE = "profile" +HARVEST_CATALOG = "harvest_catalogs" +HARVEST_CATALOG_CONFIG = "ckanext.fairdatapoint.harvest_catalogs" class FairDataPointCivityHarvester(CivityHarvester): def setup_record_provider(self, harvest_url, harvest_config_dict): - self.record_provider = FairDataPointRecordProvider(harvest_url) + # Harvest catalog config can be set on global CKAN level, but can be overriden by harvest config + harvest_catalogs = toolkit.asbool( + toolkit.config.get(HARVEST_CATALOG_CONFIG, False) + ) + if HARVEST_CATALOG in harvest_config_dict: + harvest_catalogs = toolkit.asbool( + harvest_config_dict.get(HARVEST_CATALOG, False) + ) + + self.record_provider = FairDataPointRecordProvider( + harvest_url, harvest_catalogs + ) def setup_record_to_package_converter(self, harvest_url, harvest_config_dict): if PROFILE in harvest_config_dict: - self.record_to_package_converter = 
FairDataPointRecordToPackageConverter(harvest_config_dict.get(PROFILE)) + self.record_to_package_converter = FairDataPointRecordToPackageConverter( + harvest_config_dict.get(PROFILE) + ) else: - raise Exception('[{0}] not found in harvester config JSON'.format(PROFILE)) + raise Exception("[{0}] not found in harvester config JSON".format(PROFILE)) def info(self): return { - 'name': 'fair_data_point_harvester', - 'title': 'FAIR data point harvester', - 'description': 'Harvester for end points implementing the FAIR data point protocol' + "name": "fair_data_point_harvester", + "title": "FAIR data point harvester", + "description": "Harvester for end points implementing the FAIR data point protocol", } From 7972a13aaceb36bbc3780c176f9ab6f44af77fd7 Mon Sep 17 00:00:00 2001 From: Mark Janse Date: Mon, 5 Aug 2024 15:27:15 +0200 Subject: [PATCH 2/5] Added logging option --- .../harvesters/fair_data_point_civity_harvester.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ckanext/fairdatapoint/harvesters/fair_data_point_civity_harvester.py b/ckanext/fairdatapoint/harvesters/fair_data_point_civity_harvester.py index b75fded..adde852 100644 --- a/ckanext/fairdatapoint/harvesters/fair_data_point_civity_harvester.py +++ b/ckanext/fairdatapoint/harvesters/fair_data_point_civity_harvester.py @@ -2,7 +2,7 @@ # SPDX-FileContributor: 2024 Stichting Health-RI # # SPDX-License-Identifier: AGPL-3.0-only - +import logging from ckanext.fairdatapoint.harvesters.civity_harvester import CivityHarvester from ckanext.fairdatapoint.harvesters.domain.fair_data_point_record_provider import ( @@ -17,6 +17,8 @@ HARVEST_CATALOG = "harvest_catalogs" HARVEST_CATALOG_CONFIG = "ckanext.fairdatapoint.harvest_catalogs" +log = logging.getLogger(__name__) + class FairDataPointCivityHarvester(CivityHarvester): @@ -29,6 +31,9 @@ def setup_record_provider(self, harvest_url, harvest_config_dict): harvest_catalogs = toolkit.asbool( harvest_config_dict.get(HARVEST_CATALOG, False) ) + 
log.debug("harvest_catalogs from harvester config: %s", harvest_catalogs) + else: + log.debug("harvest_catalogs from ckan config: %s", harvest_catalogs) self.record_provider = FairDataPointRecordProvider( harvest_url, harvest_catalogs From c9b66252e4252c61714b466aadcb961f4e423106 Mon Sep 17 00:00:00 2001 From: Mark Janse Date: Mon, 5 Aug 2024 15:59:26 +0200 Subject: [PATCH 3/5] Documentation update for harvest_catalogs option --- README.md | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 0383db0..8611017 100644 --- a/README.md +++ b/README.md @@ -86,13 +86,11 @@ To install gdi-userportal-ckanext-fairdatapoint: ## Config settings -None at present +There is a setting `ckanext.fairdatapoint.harvest_catalogs`. Default is `false`. If set to `true`, +ckan will harvest catalogs as datasets. -**TODO:** Document any optional config settings here. For example: - - # The minimum number of hours to wait before re-checking a resource - # (optional, default: 24). - ckanext.fairdatapoint.some_setting = some_default_value +The setting can be overriden in the harvester profile, by setting `"harvest_catalogs": "true"` or +`"harvest_catalogs": "false"` in the harvester configuration JSON. ## Developer installation From 367f5628ce037363e91602ef1b1828a05e808f13 Mon Sep 17 00:00:00 2001 From: Mark Janse Date: Mon, 5 Aug 2024 16:08:04 +0200 Subject: [PATCH 4/5] Fix capitalization to make bot happy --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8611017..4a1d950 100644 --- a/README.md +++ b/README.md @@ -87,7 +87,7 @@ To install gdi-userportal-ckanext-fairdatapoint: ## Config settings There is a setting `ckanext.fairdatapoint.harvest_catalogs`. Default is `false`. If set to `true`, -ckan will harvest catalogs as datasets. +CKAN will harvest catalogs as datasets. 
The setting can be overriden in the harvester profile, by setting `"harvest_catalogs": "true"` or `"harvest_catalogs": "false"` in the harvester configuration JSON. From 8e68b25fe3a7e968cc6300d12d5cd27293ca7198 Mon Sep 17 00:00:00 2001 From: Mark Janse Date: Tue, 6 Aug 2024 15:38:22 +0200 Subject: [PATCH 5/5] Split setting to different function --- .../fair_data_point_civity_harvester.py | 23 +++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/ckanext/fairdatapoint/harvesters/fair_data_point_civity_harvester.py b/ckanext/fairdatapoint/harvesters/fair_data_point_civity_harvester.py index adde852..5d3bd28 100644 --- a/ckanext/fairdatapoint/harvesters/fair_data_point_civity_harvester.py +++ b/ckanext/fairdatapoint/harvesters/fair_data_point_civity_harvester.py @@ -22,18 +22,23 @@ class FairDataPointCivityHarvester(CivityHarvester): - def setup_record_provider(self, harvest_url, harvest_config_dict): - # Harvest catalog config can be set on global CKAN level, but can be overriden by harvest config - harvest_catalogs = toolkit.asbool( - toolkit.config.get(HARVEST_CATALOG_CONFIG, False) - ) + def _get_harvest_catalog_setting(self, harvest_config_dict): if HARVEST_CATALOG in harvest_config_dict: - harvest_catalogs = toolkit.asbool( - harvest_config_dict.get(HARVEST_CATALOG, False) + log.debug("Using harvest_catalogs from harvest_config_dict") + harvest_catalog_setting = toolkit.asbool( + harvest_config_dict[HARVEST_CATALOG] ) - log.debug("harvest_catalogs from harvester config: %s", harvest_catalogs) else: - log.debug("harvest_catalogs from ckan config: %s", harvest_catalogs) + log.debug("Using harvest_catalogs from global CKAN config") + harvest_catalog_setting = toolkit.asbool( + toolkit.config.get(HARVEST_CATALOG_CONFIG, False) + ) + log.debug("Harvesting catalogs is set to %s", harvest_catalog_setting) + return harvest_catalog_setting + + def setup_record_provider(self, harvest_url, harvest_config_dict): + # Harvest catalog config can be 
set on global CKAN level, but can be overridden by harvest config + harvest_catalogs = self._get_harvest_catalog_setting(harvest_config_dict) self.record_provider = FairDataPointRecordProvider( harvest_url, harvest_catalogs