Merge pull request #69 from Markus92/WP4-132_no_catalog_harvest

Make catalog harvesting optional
GenomicDataInfrastructure · Aug 7, 2024 · 0f1fc2c · 0f1fc2c
2 parents cb9b75a + 20f829b
commit 0f1fc2c
Show file tree

Hide file tree

Showing 3 changed files with 88 additions and 42 deletions.
diff --git a/README.md b/README.md
@@ -86,13 +86,11 @@ To install gdi-userportal-ckanext-fairdatapoint:
 
 ## Config settings
 
-None at present
+There is a setting `ckanext.fairdatapoint.harvest_catalogs`. Default is `false`. If set to `true`,
+CKAN will harvest catalogs as datasets.
 
-**TODO:** Document any optional config settings here. For example:
-
-	# The minimum number of hours to wait before re-checking a resource
-	# (optional, default: 24).
-	ckanext.fairdatapoint.some_setting = some_default_value
+The setting can be overridden in the harvester profile, by setting `"harvest_catalogs": "true"` or
+`"harvest_catalogs": "false"` in the harvester configuration JSON.
 
 
 ## Developer installation

diff --git a/ckanext/fairdatapoint/harvesters/domain/fair_data_point_record_provider.py b/ckanext/fairdatapoint/harvesters/domain/fair_data_point_record_provider.py
@@ -18,23 +18,28 @@
 from typing import Dict, Iterable, Union
 
 
-LDP = Namespace('http://www.w3.org/ns/ldp#')
-VCARD = Namespace('http://www.w3.org/2006/vcard/ns#')
+LDP = Namespace("http://www.w3.org/ns/ldp#")
+VCARD = Namespace("http://www.w3.org/2006/vcard/ns#")
 
 log = logging.getLogger(__name__)
 
 
 class FairDataPointRecordProvider:
 
-    def __init__(self, fdp_end_point: str):
+    def __init__(self, fdp_end_point: str, harvest_catalogs: bool = False):
         self.fair_data_point = FairDataPoint(fdp_end_point)
+        self.harvest_catalogs = harvest_catalogs
 
     def get_record_ids(self) -> Dict.keys:
         """
         Returns all the FDP records which should end up as packages in CKAN to populate the "guids_in_harvest" list
         https://rdflib.readthedocs.io/en/stable/intro_to_parsing.html
         """
-        log.debug('FAIR Data Point get_records from {}'.format(self.fair_data_point.fdp_end_point))
+        log.debug(
+            "FAIR Data Point get_records from {}".format(
+                self.fair_data_point.fdp_end_point
+            )
+        )
 
         result = dict()
 
@@ -52,20 +57,21 @@ def _process_catalog(self, path: Union[str, URIRef]) -> Dict:
         catalogs_graph = self.fair_data_point.get_graph(path)
 
         for catalog_subject in catalogs_graph.subjects(RDF.type, DCAT.Catalog):
-            identifier = Identifier('')
+            identifier = Identifier("")
 
-            identifier.add('catalog', str(catalog_subject))
+            identifier.add("catalog", str(catalog_subject))
 
-            result[identifier.guid] = catalog_subject
+            if self.harvest_catalogs:
+                result[identifier.guid] = catalog_subject
 
             catalog_graph = self.fair_data_point.get_graph(catalog_subject)
 
             for dataset_subject in catalog_graph.objects(predicate=DCAT.dataset):
-                identifier = Identifier('')
+                identifier = Identifier("")
 
-                identifier.add('catalog', str(catalog_subject))
+                identifier.add("catalog", str(catalog_subject))
 
-                identifier.add('dataset', str(dataset_subject))
+                identifier.add("dataset", str(dataset_subject))
 
                 result[identifier.guid] = dataset_subject
 
@@ -76,7 +82,10 @@ def get_record_by_id(self, guid: str) -> str:
         Get additional information for FDP record.
         """
         log.debug(
-            'FAIR data point get_record_by_id from {} for {}'.format(self.fair_data_point.fdp_end_point, guid))
+            "FAIR data point get_record_by_id from {} for {}".format(
+                self.fair_data_point.fdp_end_point, guid
+            )
+        )
 
         identifier = Identifier(guid)
 
@@ -89,7 +98,9 @@ def get_record_by_id(self, guid: str) -> str:
         self._remove_fdp_defaults(g, subject_uri)
 
         # Add information from distribution to graph
-        for distribution_uri in g.objects(subject=subject_uri, predicate=DCAT.distribution):
+        for distribution_uri in g.objects(
+            subject=subject_uri, predicate=DCAT.distribution
+        ):
             distribution_g = self.fair_data_point.get_graph(distribution_uri)
 
             self._remove_fdp_defaults(g, distribution_uri)
@@ -99,17 +110,21 @@ def get_record_by_id(self, guid: str) -> str:
                 DCTERMS.format,
                 DCTERMS.license,
                 DCTERMS.title,
-                DCAT.accessURL
+                DCAT.accessURL,
             ]:
-                for distr_attribute_value in self.get_values(distribution_g, distribution_uri, predicate):
+                for distr_attribute_value in self.get_values(
+                    distribution_g, distribution_uri, predicate
+                ):
                     g.add((distribution_uri, predicate, distr_attribute_value))
 
         # Look-up contact information
         for contact_point_uri in self.get_values(g, subject_uri, DCAT.contactPoint):
             if isinstance(contact_point_uri, URIRef):
-                self._parse_contact_point(g=g, subject_uri=subject_uri, contact_point_uri=contact_point_uri)
+                self._parse_contact_point(
+                    g=g, subject_uri=subject_uri, contact_point_uri=contact_point_uri
+                )
 
-        result = g.serialize(format='ttl')
+        result = g.serialize(format="ttl")
 
         return result
 
@@ -123,19 +138,23 @@ def _parse_contact_point(g: Graph, subject_uri: URIRef, contact_point_uri: URIRe
         g.add((subject_uri, DCAT.contactPoint, vcard_node))
         g.add((vcard_node, RDF.type, VCARD.Kind))
         g.add((vcard_node, VCARD.hasUID, contact_point_uri))
-        if 'orcid' in str(contact_point_uri):
+        if "orcid" in str(contact_point_uri):
             try:
-                orcid_response = requests.get(str(contact_point_uri).rstrip('/') + '/public-record.json')
+                orcid_response = requests.get(
+                    str(contact_point_uri).rstrip("/") + "/public-record.json"
+                )
                 json_orcid_response = orcid_response.json()
-                name = json_orcid_response['displayName']
+                name = json_orcid_response["displayName"]
                 g.add((vcard_node, VCARD.fn, Literal(name)))
             except (JSONDecodeError, HTTPError) as e:
-                log.error(f'Failed to get data from ORCID for {contact_point_uri}: {e}')
+                log.error(f"Failed to get data from ORCID for {contact_point_uri}: {e}")
 
     @staticmethod
-    def get_values(graph: Graph,
-                   subject: Union[str, URIRef, Node],
-                   predicate: Union[str, URIRef, Node]) -> Iterable[Node]:
+    def get_values(
+        graph: Graph,
+        subject: Union[str, URIRef, Node],
+        predicate: Union[str, URIRef, Node],
+    ) -> Iterable[Node]:
         subject_uri = URIRef(subject)
         predicate_uri = URIRef(predicate)
 
@@ -144,8 +163,8 @@ def get_values(graph: Graph,
 
     @staticmethod
     def _remove_fdp_defaults(g, subject_uri):
-        for (s, p, o) in g.triples((subject_uri, DCTERMS.accessRights, None)):
-            access_rights_default = URIRef(f'{subject_uri}#accessRights')
+        for s, p, o in g.triples((subject_uri, DCTERMS.accessRights, None)):
+            access_rights_default = URIRef(f"{subject_uri}#accessRights")
             if o == access_rights_default:
                 g.remove((subject_uri, DCTERMS.accessRights, o))
                 g.remove((access_rights_default, None, None))
diff --git a/ckanext/fairdatapoint/harvesters/fair_data_point_civity_harvester.py b/ckanext/fairdatapoint/harvesters/fair_data_point_civity_harvester.py
@@ -2,30 +2,59 @@
 # SPDX-FileContributor: 2024 Stichting Health-RI
 #
 # SPDX-License-Identifier: AGPL-3.0-only
-
+import logging
 
 from ckanext.fairdatapoint.harvesters.civity_harvester import CivityHarvester
-from ckanext.fairdatapoint.harvesters.domain.fair_data_point_record_provider import FairDataPointRecordProvider
-from ckanext.fairdatapoint.harvesters.domain.fair_data_point_record_to_package_converter import \
-    FairDataPointRecordToPackageConverter
+from ckanext.fairdatapoint.harvesters.domain.fair_data_point_record_provider import (
+    FairDataPointRecordProvider,
+)
+from ckanext.fairdatapoint.harvesters.domain.fair_data_point_record_to_package_converter import (
+    FairDataPointRecordToPackageConverter,
+)
+from ckan.plugins import toolkit
+
+PROFILE = "profile"
+HARVEST_CATALOG = "harvest_catalogs"
+HARVEST_CATALOG_CONFIG = "ckanext.fairdatapoint.harvest_catalogs"
 
-PROFILE = 'profile'
+log = logging.getLogger(__name__)
 
 
 class FairDataPointCivityHarvester(CivityHarvester):
 
+    def _get_harvest_catalog_setting(self, harvest_config_dict):
+        if HARVEST_CATALOG in harvest_config_dict:
+            log.debug("Using harvest_catalogs from harvest_config_dict")
+            harvest_catalog_setting = toolkit.asbool(
+                harvest_config_dict[HARVEST_CATALOG]
+            )
+        else:
+            log.debug("Using harvest_catalogs from global CKAN config")
+            harvest_catalog_setting = toolkit.asbool(
+                toolkit.config.get(HARVEST_CATALOG_CONFIG, False)
+            )
+        log.debug("Harvesting catalogs is set to %s", harvest_catalog_setting)
+        return harvest_catalog_setting
+
     def setup_record_provider(self, harvest_url, harvest_config_dict):
-        self.record_provider = FairDataPointRecordProvider(harvest_url)
+        # Harvest catalog config can be set on global CKAN level, but can be overridden by the harvest config.
+        harvest_catalogs = self._get_harvest_catalog_setting(harvest_config_dict)
+
+        self.record_provider = FairDataPointRecordProvider(
+            harvest_url, harvest_catalogs
+        )
 
     def setup_record_to_package_converter(self, harvest_url, harvest_config_dict):
         if PROFILE in harvest_config_dict:
-            self.record_to_package_converter = FairDataPointRecordToPackageConverter(harvest_config_dict.get(PROFILE))
+            self.record_to_package_converter = FairDataPointRecordToPackageConverter(
+                harvest_config_dict.get(PROFILE)
+            )
         else:
-            raise Exception('[{0}] not found in harvester config JSON'.format(PROFILE))
+            raise Exception("[{0}] not found in harvester config JSON".format(PROFILE))
 
     def info(self):
         return {
-            'name': 'fair_data_point_harvester',
-            'title': 'FAIR data point harvester',
-            'description': 'Harvester for end points implementing the FAIR data point protocol'
+            "name": "fair_data_point_harvester",
+            "title": "FAIR data point harvester",
+            "description": "Harvester for end points implementing the FAIR data point protocol",
         }