From d694918aaffb5ca23fbc8e0186df377460a5733b Mon Sep 17 00:00:00 2001
From: Mark Janse <mark.janse@health-ri.nl>
Date: Mon, 5 Aug 2024 14:30:34 +0200
Subject: [PATCH 1/5] feat: add option for harvesting catalogs

---
 .../domain/fair_data_point_record_provider.py | 69 ++++++++++++-------
 .../fair_data_point_civity_harvester.py       | 39 ++++++++---
 2 files changed, 73 insertions(+), 35 deletions(-)

diff --git a/ckanext/fairdatapoint/harvesters/domain/fair_data_point_record_provider.py b/ckanext/fairdatapoint/harvesters/domain/fair_data_point_record_provider.py
index 6e3f7c8..0ae60c9 100644
--- a/ckanext/fairdatapoint/harvesters/domain/fair_data_point_record_provider.py
+++ b/ckanext/fairdatapoint/harvesters/domain/fair_data_point_record_provider.py
@@ -18,23 +18,28 @@
 from typing import Dict, Iterable, Union
 
 
-LDP = Namespace('http://www.w3.org/ns/ldp#')
-VCARD = Namespace('http://www.w3.org/2006/vcard/ns#')
+LDP = Namespace("http://www.w3.org/ns/ldp#")
+VCARD = Namespace("http://www.w3.org/2006/vcard/ns#")
 
 log = logging.getLogger(__name__)
 
 
 class FairDataPointRecordProvider:
 
-    def __init__(self, fdp_end_point: str):
+    def __init__(self, fdp_end_point: str, harvest_catalogs: bool = False):
         self.fair_data_point = FairDataPoint(fdp_end_point)
+        self.harvest_catalogs = harvest_catalogs
 
     def get_record_ids(self) -> Dict.keys:
         """
         Returns all the FDP records which should end up as packages in CKAN to populate the "guids_in_harvest" list
         https://rdflib.readthedocs.io/en/stable/intro_to_parsing.html
         """
-        log.debug('FAIR Data Point get_records from {}'.format(self.fair_data_point.fdp_end_point))
+        log.debug(
+            "FAIR Data Point get_records from {}".format(
+                self.fair_data_point.fdp_end_point
+            )
+        )
 
         result = dict()
 
@@ -52,20 +57,21 @@ def _process_catalog(self, path: Union[str, URIRef]) -> Dict:
         catalogs_graph = self.fair_data_point.get_graph(path)
 
         for catalog_subject in catalogs_graph.subjects(RDF.type, DCAT.Catalog):
-            identifier = Identifier('')
+            identifier = Identifier("")
 
-            identifier.add('catalog', str(catalog_subject))
+            identifier.add("catalog", str(catalog_subject))
 
-            result[identifier.guid] = catalog_subject
+            if self.harvest_catalogs:
+                result[identifier.guid] = catalog_subject
 
             catalog_graph = self.fair_data_point.get_graph(catalog_subject)
 
             for dataset_subject in catalog_graph.objects(predicate=DCAT.dataset):
-                identifier = Identifier('')
+                identifier = Identifier("")
 
-                identifier.add('catalog', str(catalog_subject))
+                identifier.add("catalog", str(catalog_subject))
 
-                identifier.add('dataset', str(dataset_subject))
+                identifier.add("dataset", str(dataset_subject))
 
                 result[identifier.guid] = dataset_subject
 
@@ -76,7 +82,10 @@ def get_record_by_id(self, guid: str) -> str:
         Get additional information for FDP record.
         """
         log.debug(
-            'FAIR data point get_record_by_id from {} for {}'.format(self.fair_data_point.fdp_end_point, guid))
+            "FAIR data point get_record_by_id from {} for {}".format(
+                self.fair_data_point.fdp_end_point, guid
+            )
+        )
 
         identifier = Identifier(guid)
 
@@ -89,7 +98,9 @@ def get_record_by_id(self, guid: str) -> str:
         self._remove_fdp_defaults(g, subject_uri)
 
         # Add information from distribution to graph
-        for distribution_uri in g.objects(subject=subject_uri, predicate=DCAT.distribution):
+        for distribution_uri in g.objects(
+            subject=subject_uri, predicate=DCAT.distribution
+        ):
             distribution_g = self.fair_data_point.get_graph(distribution_uri)
 
             self._remove_fdp_defaults(g, distribution_uri)
@@ -99,17 +110,21 @@ def get_record_by_id(self, guid: str) -> str:
                 DCTERMS.format,
                 DCTERMS.license,
                 DCTERMS.title,
-                DCAT.accessURL
+                DCAT.accessURL,
             ]:
-                for distr_attribute_value in self.get_values(distribution_g, distribution_uri, predicate):
+                for distr_attribute_value in self.get_values(
+                    distribution_g, distribution_uri, predicate
+                ):
                     g.add((distribution_uri, predicate, distr_attribute_value))
 
         # Look-up contact information
         for contact_point_uri in self.get_values(g, subject_uri, DCAT.contactPoint):
             if isinstance(contact_point_uri, URIRef):
-                self._parse_contact_point(g=g, subject_uri=subject_uri, contact_point_uri=contact_point_uri)
+                self._parse_contact_point(
+                    g=g, subject_uri=subject_uri, contact_point_uri=contact_point_uri
+                )
 
-        result = g.serialize(format='ttl')
+        result = g.serialize(format="ttl")
 
         return result
 
@@ -123,19 +138,23 @@ def _parse_contact_point(g: Graph, subject_uri: URIRef, contact_point_uri: URIRe
         g.add((subject_uri, DCAT.contactPoint, vcard_node))
         g.add((vcard_node, RDF.type, VCARD.Kind))
         g.add((vcard_node, VCARD.hasUID, contact_point_uri))
-        if 'orcid' in str(contact_point_uri):
+        if "orcid" in str(contact_point_uri):
             try:
-                orcid_response = requests.get(str(contact_point_uri).rstrip('/') + '/public-record.json')
+                orcid_response = requests.get(
+                    str(contact_point_uri).rstrip("/") + "/public-record.json"
+                )
                 json_orcid_response = orcid_response.json()
-                name = json_orcid_response['displayName']
+                name = json_orcid_response["displayName"]
                 g.add((vcard_node, VCARD.fn, Literal(name)))
             except (JSONDecodeError, HTTPError) as e:
-                log.error(f'Failed to get data from ORCID for {contact_point_uri}: {e}')
+                log.error(f"Failed to get data from ORCID for {contact_point_uri}: {e}")
 
     @staticmethod
-    def get_values(graph: Graph,
-                   subject: Union[str, URIRef, Node],
-                   predicate: Union[str, URIRef, Node]) -> Iterable[Node]:
+    def get_values(
+        graph: Graph,
+        subject: Union[str, URIRef, Node],
+        predicate: Union[str, URIRef, Node],
+    ) -> Iterable[Node]:
         subject_uri = URIRef(subject)
         predicate_uri = URIRef(predicate)
 
@@ -144,8 +163,8 @@ def get_values(graph: Graph,
 
     @staticmethod
     def _remove_fdp_defaults(g, subject_uri):
-        for (s, p, o) in g.triples((subject_uri, DCTERMS.accessRights, None)):
-            access_rights_default = URIRef(f'{subject_uri}#accessRights')
+        for s, p, o in g.triples((subject_uri, DCTERMS.accessRights, None)):
+            access_rights_default = URIRef(f"{subject_uri}#accessRights")
             if o == access_rights_default:
                 g.remove((subject_uri, DCTERMS.accessRights, o))
                 g.remove((access_rights_default, None, None))
diff --git a/ckanext/fairdatapoint/harvesters/fair_data_point_civity_harvester.py b/ckanext/fairdatapoint/harvesters/fair_data_point_civity_harvester.py
index b91b4a5..b75fded 100644
--- a/ckanext/fairdatapoint/harvesters/fair_data_point_civity_harvester.py
+++ b/ckanext/fairdatapoint/harvesters/fair_data_point_civity_harvester.py
@@ -5,27 +5,46 @@
 
 
 from ckanext.fairdatapoint.harvesters.civity_harvester import CivityHarvester
-from ckanext.fairdatapoint.harvesters.domain.fair_data_point_record_provider import FairDataPointRecordProvider
-from ckanext.fairdatapoint.harvesters.domain.fair_data_point_record_to_package_converter import \
-    FairDataPointRecordToPackageConverter
+from ckanext.fairdatapoint.harvesters.domain.fair_data_point_record_provider import (
+    FairDataPointRecordProvider,
+)
+from ckanext.fairdatapoint.harvesters.domain.fair_data_point_record_to_package_converter import (
+    FairDataPointRecordToPackageConverter,
+)
+from ckan.plugins import toolkit
 
-PROFILE = 'profile'
+PROFILE = "profile"
+HARVEST_CATALOG = "harvest_catalogs"
+HARVEST_CATALOG_CONFIG = "ckanext.fairdatapoint.harvest_catalogs"
 
 
 class FairDataPointCivityHarvester(CivityHarvester):
 
     def setup_record_provider(self, harvest_url, harvest_config_dict):
-        self.record_provider = FairDataPointRecordProvider(harvest_url)
+        # Harvest catalog config can be set on global CKAN level, but can be overriden by harvest config
+        harvest_catalogs = toolkit.asbool(
+            toolkit.config.get(HARVEST_CATALOG_CONFIG, False)
+        )
+        if HARVEST_CATALOG in harvest_config_dict:
+            harvest_catalogs = toolkit.asbool(
+                harvest_config_dict.get(HARVEST_CATALOG, False)
+            )
+
+        self.record_provider = FairDataPointRecordProvider(
+            harvest_url, harvest_catalogs
+        )
 
     def setup_record_to_package_converter(self, harvest_url, harvest_config_dict):
         if PROFILE in harvest_config_dict:
-            self.record_to_package_converter = FairDataPointRecordToPackageConverter(harvest_config_dict.get(PROFILE))
+            self.record_to_package_converter = FairDataPointRecordToPackageConverter(
+                harvest_config_dict.get(PROFILE)
+            )
         else:
-            raise Exception('[{0}] not found in harvester config JSON'.format(PROFILE))
+            raise Exception("[{0}] not found in harvester config JSON".format(PROFILE))
 
     def info(self):
         return {
-            'name': 'fair_data_point_harvester',
-            'title': 'FAIR data point harvester',
-            'description': 'Harvester for end points implementing the FAIR data point protocol'
+            "name": "fair_data_point_harvester",
+            "title": "FAIR data point harvester",
+            "description": "Harvester for end points implementing the FAIR data point protocol",
         }

From 7972a13aaceb36bbc3780c176f9ab6f44af77fd7 Mon Sep 17 00:00:00 2001
From: Mark Janse <mark.janse@health-ri.nl>
Date: Mon, 5 Aug 2024 15:27:15 +0200
Subject: [PATCH 2/5] Added logging option

---
 .../harvesters/fair_data_point_civity_harvester.py         | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/ckanext/fairdatapoint/harvesters/fair_data_point_civity_harvester.py b/ckanext/fairdatapoint/harvesters/fair_data_point_civity_harvester.py
index b75fded..adde852 100644
--- a/ckanext/fairdatapoint/harvesters/fair_data_point_civity_harvester.py
+++ b/ckanext/fairdatapoint/harvesters/fair_data_point_civity_harvester.py
@@ -2,7 +2,7 @@
 # SPDX-FileContributor: 2024 Stichting Health-RI
 #
 # SPDX-License-Identifier: AGPL-3.0-only
-
+import logging
 
 from ckanext.fairdatapoint.harvesters.civity_harvester import CivityHarvester
 from ckanext.fairdatapoint.harvesters.domain.fair_data_point_record_provider import (
@@ -17,6 +17,8 @@
 HARVEST_CATALOG = "harvest_catalogs"
 HARVEST_CATALOG_CONFIG = "ckanext.fairdatapoint.harvest_catalogs"
 
+log = logging.getLogger(__name__)
+
 
 class FairDataPointCivityHarvester(CivityHarvester):
 
@@ -29,6 +31,9 @@ def setup_record_provider(self, harvest_url, harvest_config_dict):
             harvest_catalogs = toolkit.asbool(
                 harvest_config_dict.get(HARVEST_CATALOG, False)
             )
+            log.debug("harvest_catalogs from harvester config: %s", harvest_catalogs)
+        else:
+            log.debug("harvest_catalogs from ckan config: %s", harvest_catalogs)
 
         self.record_provider = FairDataPointRecordProvider(
             harvest_url, harvest_catalogs

From c9b66252e4252c61714b466aadcb961f4e423106 Mon Sep 17 00:00:00 2001
From: Mark Janse <mark.janse@health-ri.nl>
Date: Mon, 5 Aug 2024 15:59:26 +0200
Subject: [PATCH 3/5] Documentation update for harvest_catalogs option

---
 README.md | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 0383db0..8611017 100644
--- a/README.md
+++ b/README.md
@@ -86,13 +86,11 @@ To install gdi-userportal-ckanext-fairdatapoint:
 
 ## Config settings
 
-None at present
+There is a setting `ckanext.fairdatapoint.harvest_catalogs`. Default is `false`. If set to `true`,
+ckan will harvest catalogs as datasets.
 
-**TODO:** Document any optional config settings here. For example:
-
-	# The minimum number of hours to wait before re-checking a resource
-	# (optional, default: 24).
-	ckanext.fairdatapoint.some_setting = some_default_value
+The setting can be overriden in the harvester profile, by setting `"harvest_catalogs": "true"` or
+`"harvest_catalogs": "false"` in the harvester configuration JSON.
 
 
 ## Developer installation

From 367f5628ce037363e91602ef1b1828a05e808f13 Mon Sep 17 00:00:00 2001
From: Mark Janse <mark.janse@health-ri.nl>
Date: Mon, 5 Aug 2024 16:08:04 +0200
Subject: [PATCH 4/5] Fix capitalization to make bot happy

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 8611017..4a1d950 100644
--- a/README.md
+++ b/README.md
@@ -87,7 +87,7 @@ To install gdi-userportal-ckanext-fairdatapoint:
 ## Config settings
 
 There is a setting `ckanext.fairdatapoint.harvest_catalogs`. Default is `false`. If set to `true`,
-ckan will harvest catalogs as datasets.
+CKAN will harvest catalogs as datasets.
 
 The setting can be overriden in the harvester profile, by setting `"harvest_catalogs": "true"` or
 `"harvest_catalogs": "false"` in the harvester configuration JSON.

From 8e68b25fe3a7e968cc6300d12d5cd27293ca7198 Mon Sep 17 00:00:00 2001
From: Mark Janse <mark.janse@health-ri.nl>
Date: Tue, 6 Aug 2024 15:38:22 +0200
Subject: [PATCH 5/5] Split setting to different function

---
 .../fair_data_point_civity_harvester.py       | 23 +++++++++++--------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/ckanext/fairdatapoint/harvesters/fair_data_point_civity_harvester.py b/ckanext/fairdatapoint/harvesters/fair_data_point_civity_harvester.py
index adde852..5d3bd28 100644
--- a/ckanext/fairdatapoint/harvesters/fair_data_point_civity_harvester.py
+++ b/ckanext/fairdatapoint/harvesters/fair_data_point_civity_harvester.py
@@ -22,18 +22,23 @@
 
 class FairDataPointCivityHarvester(CivityHarvester):
 
-    def setup_record_provider(self, harvest_url, harvest_config_dict):
-        # Harvest catalog config can be set on global CKAN level, but can be overriden by harvest config
-        harvest_catalogs = toolkit.asbool(
-            toolkit.config.get(HARVEST_CATALOG_CONFIG, False)
-        )
+    def _get_harvest_catalog_setting(self, harvest_config_dict):
         if HARVEST_CATALOG in harvest_config_dict:
-            harvest_catalogs = toolkit.asbool(
-                harvest_config_dict.get(HARVEST_CATALOG, False)
+            log.debug("Using harvest_catalogs from harvest_config_dict")
+            harvest_catalog_setting = toolkit.asbool(
+                harvest_config_dict[HARVEST_CATALOG]
             )
-            log.debug("harvest_catalogs from harvester config: %s", harvest_catalogs)
         else:
-            log.debug("harvest_catalogs from ckan config: %s", harvest_catalogs)
+            log.debug("Using harvest_catalogs from global CKAN config")
+            harvest_catalog_setting = toolkit.asbool(
+                toolkit.config.get(HARVEST_CATALOG_CONFIG, False)
+            )
+        log.debug("Harvesting catalogs is set to %s", harvest_catalog_setting)
+        return harvest_catalog_setting
+
+    def setup_record_provider(self, harvest_url, harvest_config_dict):
+        # Harvest catalog config can be set on global CKAN level, but can be overridden by harvest config
+        harvest_catalogs = self._get_harvest_catalog_setting(harvest_config_dict)
 
         self.record_provider = FairDataPointRecordProvider(
             harvest_url, harvest_catalogs