Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make catalog harvesting optional #69

Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 4 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,13 +86,11 @@ To install gdi-userportal-ckanext-fairdatapoint:

## Config settings

None at present
The setting `ckanext.fairdatapoint.harvest_catalogs` controls whether catalogs are harvested. The default
is `false`. If set to `true`, CKAN will harvest catalogs as datasets.

**TODO:** Document any optional config settings here. For example:

# The minimum number of hours to wait before re-checking a resource
# (optional, default: 24).
ckanext.fairdatapoint.some_setting = some_default_value
The setting can be overridden in the harvester profile by setting `"harvest_catalogs": "true"` or
`"harvest_catalogs": "false"` in the harvester configuration JSON.


## Developer installation
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,23 +18,28 @@
from typing import Dict, Iterable, Union


LDP = Namespace('http://www.w3.org/ns/ldp#')
VCARD = Namespace('http://www.w3.org/2006/vcard/ns#')
LDP = Namespace("http://www.w3.org/ns/ldp#")
VCARD = Namespace("http://www.w3.org/2006/vcard/ns#")

log = logging.getLogger(__name__)


class FairDataPointRecordProvider:

def __init__(self, fdp_end_point: str):
def __init__(self, fdp_end_point: str, harvest_catalogs: bool = False):
self.fair_data_point = FairDataPoint(fdp_end_point)
self.harvest_catalogs = harvest_catalogs

def get_record_ids(self) -> Dict.keys:
"""
Returns all the FDP records which should end up as packages in CKAN to populate the "guids_in_harvest" list
https://rdflib.readthedocs.io/en/stable/intro_to_parsing.html
"""
log.debug('FAIR Data Point get_records from {}'.format(self.fair_data_point.fdp_end_point))
log.debug(
"FAIR Data Point get_records from {}".format(
self.fair_data_point.fdp_end_point
)
)

result = dict()

Expand All @@ -52,20 +57,21 @@ def _process_catalog(self, path: Union[str, URIRef]) -> Dict:
catalogs_graph = self.fair_data_point.get_graph(path)

for catalog_subject in catalogs_graph.subjects(RDF.type, DCAT.Catalog):
identifier = Identifier('')
identifier = Identifier("")

identifier.add('catalog', str(catalog_subject))
identifier.add("catalog", str(catalog_subject))

result[identifier.guid] = catalog_subject
if self.harvest_catalogs:
result[identifier.guid] = catalog_subject

catalog_graph = self.fair_data_point.get_graph(catalog_subject)

for dataset_subject in catalog_graph.objects(predicate=DCAT.dataset):
identifier = Identifier('')
identifier = Identifier("")

identifier.add('catalog', str(catalog_subject))
identifier.add("catalog", str(catalog_subject))

identifier.add('dataset', str(dataset_subject))
identifier.add("dataset", str(dataset_subject))

result[identifier.guid] = dataset_subject

Expand All @@ -76,7 +82,10 @@ def get_record_by_id(self, guid: str) -> str:
Get additional information for FDP record.
"""
log.debug(
'FAIR data point get_record_by_id from {} for {}'.format(self.fair_data_point.fdp_end_point, guid))
"FAIR data point get_record_by_id from {} for {}".format(
self.fair_data_point.fdp_end_point, guid
)
)

identifier = Identifier(guid)

Expand All @@ -89,7 +98,9 @@ def get_record_by_id(self, guid: str) -> str:
self._remove_fdp_defaults(g, subject_uri)

# Add information from distribution to graph
for distribution_uri in g.objects(subject=subject_uri, predicate=DCAT.distribution):
for distribution_uri in g.objects(
subject=subject_uri, predicate=DCAT.distribution
):
distribution_g = self.fair_data_point.get_graph(distribution_uri)

self._remove_fdp_defaults(g, distribution_uri)
Expand All @@ -99,17 +110,21 @@ def get_record_by_id(self, guid: str) -> str:
DCTERMS.format,
DCTERMS.license,
DCTERMS.title,
DCAT.accessURL
DCAT.accessURL,
]:
for distr_attribute_value in self.get_values(distribution_g, distribution_uri, predicate):
for distr_attribute_value in self.get_values(
distribution_g, distribution_uri, predicate
):
g.add((distribution_uri, predicate, distr_attribute_value))

# Look-up contact information
for contact_point_uri in self.get_values(g, subject_uri, DCAT.contactPoint):
if isinstance(contact_point_uri, URIRef):
self._parse_contact_point(g=g, subject_uri=subject_uri, contact_point_uri=contact_point_uri)
self._parse_contact_point(
g=g, subject_uri=subject_uri, contact_point_uri=contact_point_uri
)

result = g.serialize(format='ttl')
result = g.serialize(format="ttl")

return result

Expand All @@ -123,19 +138,23 @@ def _parse_contact_point(g: Graph, subject_uri: URIRef, contact_point_uri: URIRe
g.add((subject_uri, DCAT.contactPoint, vcard_node))
g.add((vcard_node, RDF.type, VCARD.Kind))
g.add((vcard_node, VCARD.hasUID, contact_point_uri))
if 'orcid' in str(contact_point_uri):
if "orcid" in str(contact_point_uri):
try:
orcid_response = requests.get(str(contact_point_uri).rstrip('/') + '/public-record.json')
orcid_response = requests.get(
str(contact_point_uri).rstrip("/") + "/public-record.json"
)
json_orcid_response = orcid_response.json()
name = json_orcid_response['displayName']
name = json_orcid_response["displayName"]
g.add((vcard_node, VCARD.fn, Literal(name)))
except (JSONDecodeError, HTTPError) as e:
log.error(f'Failed to get data from ORCID for {contact_point_uri}: {e}')
log.error(f"Failed to get data from ORCID for {contact_point_uri}: {e}")

@staticmethod
def get_values(graph: Graph,
subject: Union[str, URIRef, Node],
predicate: Union[str, URIRef, Node]) -> Iterable[Node]:
def get_values(
graph: Graph,
subject: Union[str, URIRef, Node],
predicate: Union[str, URIRef, Node],
) -> Iterable[Node]:
subject_uri = URIRef(subject)
predicate_uri = URIRef(predicate)

Expand All @@ -144,8 +163,8 @@ def get_values(graph: Graph,

@staticmethod
def _remove_fdp_defaults(g, subject_uri):
for (s, p, o) in g.triples((subject_uri, DCTERMS.accessRights, None)):
access_rights_default = URIRef(f'{subject_uri}#accessRights')
for s, p, o in g.triples((subject_uri, DCTERMS.accessRights, None)):
access_rights_default = URIRef(f"{subject_uri}#accessRights")
if o == access_rights_default:
g.remove((subject_uri, DCTERMS.accessRights, o))
g.remove((access_rights_default, None, None))
Original file line number Diff line number Diff line change
Expand Up @@ -2,30 +2,59 @@
# SPDX-FileContributor: 2024 Stichting Health-RI
#
# SPDX-License-Identifier: AGPL-3.0-only

import logging
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

issue (complexity): Consider simplifying the code while maintaining the new functionality.

The new code introduces useful functionality but also adds complexity. Here are some points to consider:

  1. Increased Complexity: The new code has more lines and nested conditions, making it harder to read and understand at a glance. The original code was more straightforward.

  2. Logging and Configuration Handling: While logging is useful, it adds complexity by introducing more logic and potential points of failure. The original code did not have this additional layer of configuration handling.

  3. Use of toolkit: The new code uses toolkit to fetch configuration values, adding another dependency and layer of abstraction. This makes the code harder to maintain.

  4. Conditional Logic: The new code has more conditional logic to handle different sources of configuration (global CKAN level vs harvest config). This increases the cognitive load required to understand the flow of the program.

Consider simplifying the code while maintaining the new functionality. Here is a suggestion:

import logging
from ckanext.fairdatapoint.harvesters.civity_harvester import CivityHarvester
from ckanext.fairdatapoint.harvesters.domain.fair_data_point_record_provider import (
    FairDataPointRecordProvider,
)
from ckanext.fairdatapoint.harvesters.domain.fair_data_point_record_to_package_converter import (
    FairDataPointRecordToPackageConverter,
)
from ckan.plugins import toolkit

PROFILE = "profile"
HARVEST_CATALOG = "harvest_catalogs"
HARVEST_CATALOG_CONFIG = "ckanext.fairdatapoint.harvest_catalogs"

log = logging.getLogger(__name__)

class FairDataPointCivityHarvester(CivityHarvester):
    def setup_record_provider(self, harvest_url, harvest_config_dict):
        # Determine harvest_catalogs from config or default to False
        harvest_catalogs = toolkit.asbool(
            harvest_config_dict.get(HARVEST_CATALOG, toolkit.config.get(HARVEST_CATALOG_CONFIG, False))
        )
        log.debug("harvest_catalogs: %s", harvest_catalogs)

        self.record_provider = FairDataPointRecordProvider(harvest_url, harvest_catalogs)

    def setup_record_to_package_converter(self, harvest_url, harvest_config_dict):
        if PROFILE in harvest_config_dict:
            self.record_to_package_converter = FairDataPointRecordToPackageConverter(
                harvest_config_dict.get(PROFILE)
            )
        else:
            raise Exception("[{0}] not found in harvester config JSON".format(PROFILE))

    def info(self):
        return {
            "name": "fair_data_point_harvester",
            "title": "FAIR data point harvester",
            "description": "Harvester for end points implementing the FAIR data point protocol",
        }

This approach reduces complexity while maintaining the new functionality.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi bot, I wouldn't exactly call nested get statements more readable. The entire plugin is a mess when it comes to the style guide, but to keep the diffs small I'm not running a full linter over it yet.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi Markus92,

I understand your concern about nested get statements potentially reducing readability. The goal of my suggestion was to streamline the logic and reduce the cognitive load by consolidating the configuration fetching into a single line. However, I see your point about the overall style and the importance of keeping the diffs small.

If running a full linter over the entire plugin isn't feasible right now, perhaps we can find a middle ground. We could refactor the configuration fetching into a separate method to improve readability without introducing too many changes at once. Here's an example:

import logging
from ckanext.fairdatapoint.harvesters.civity_harvester import CivityHarvester
from ckanext.fairdatapoint.harvesters.domain.fair_data_point_record_provider import (
    FairDataPointRecordProvider,
)
from ckanext.fairdatapoint.harvesters.domain.fair_data_point_record_to_package_converter import (
    FairDataPointRecordToPackageConverter,
)
from ckan.plugins import toolkit

PROFILE = "profile"
HARVEST_CATALOG = "harvest_catalogs"
HARVEST_CATALOG_CONFIG = "ckanext.fairdatapoint.harvest_catalogs"

log = logging.getLogger(__name__)

class FairDataPointCivityHarvester(CivityHarvester):
    def setup_record_provider(self, harvest_url, harvest_config_dict):
        harvest_catalogs = self._get_harvest_catalogs(harvest_config_dict)
        log.debug("harvest_catalogs: %s", harvest_catalogs)

        self.record_provider = FairDataPointRecordProvider(harvest_url, harvest_catalogs)

    def setup_record_to_package_converter(self, harvest_url, harvest_config_dict):
        if PROFILE in harvest_config_dict:
            self.record_to_package_converter = FairDataPointRecordToPackageConverter(
                harvest_config_dict.get(PROFILE)
            )
        else:
            raise Exception("[{0}] not found in harvester config JSON".format(PROFILE))

    def info(self):
        return {
            "name": "fair_data_point_harvester",
            "title": "FAIR data point harvester",
            "description": "Harvester for end points implementing the FAIR data point protocol",
        }

    def _get_harvest_catalogs(self, harvest_config_dict):
        return toolkit.asbool(
            harvest_config_dict.get(HARVEST_CATALOG, toolkit.config.get(HARVEST_CATALOG_CONFIG, False))
        )

This way, the main methods remain clean, and the configuration logic is encapsulated in a separate method. Let me know what you think!

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But now you removed the logging. How can I see where the configuration came from? That would be extremely useful for debugging issues when the wrong behavior is shown.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi Markus92,

I understand the importance of logging for debugging purposes. Let's incorporate the logging back into the refactored method to ensure we can trace where the configuration values are coming from. Here's an updated version that includes detailed logging:

import logging
from ckanext.fairdatapoint.harvesters.civity_harvester import CivityHarvester
from ckanext.fairdatapoint.harvesters.domain.fair_data_point_record_provider import (
    FairDataPointRecordProvider,
)
from ckanext.fairdatapoint.harvesters.domain.fair_data_point_record_to_package_converter import (
    FairDataPointRecordToPackageConverter,
)
from ckan.plugins import toolkit

PROFILE = "profile"
HARVEST_CATALOG = "harvest_catalogs"
HARVEST_CATALOG_CONFIG = "ckanext.fairdatapoint.harvest_catalogs"

log = logging.getLogger(__name__)

class FairDataPointCivityHarvester(CivityHarvester):
    def setup_record_provider(self, harvest_url, harvest_config_dict):
        harvest_catalogs = self._get_harvest_catalogs(harvest_config_dict)
        log.debug("harvest_catalogs: %s", harvest_catalogs)

        self.record_provider = FairDataPointRecordProvider(harvest_url, harvest_catalogs)

    def setup_record_to_package_converter(self, harvest_url, harvest_config_dict):
        if PROFILE in harvest_config_dict:
            self.record_to_package_converter = FairDataPointRecordToPackageConverter(
                harvest_config_dict.get(PROFILE)
            )
        else:
            raise Exception("[{0}] not found in harvester config JSON".format(PROFILE))

    def info(self):
        return {
            "name": "fair_data_point_harvester",
            "title": "FAIR data point harvester",
            "description": "Harvester for end points implementing the FAIR data point protocol",
        }

    def _get_harvest_catalogs(self, harvest_config_dict):
        if HARVEST_CATALOG in harvest_config_dict:
            log.debug("Using harvest_catalogs from harvest_config_dict")
            return toolkit.asbool(harvest_config_dict[HARVEST_CATALOG])
        else:
            log.debug("Using harvest_catalogs from global CKAN config")
            return toolkit.asbool(toolkit.config.get(HARVEST_CATALOG_CONFIG, False))

This way, we maintain the readability improvements while ensuring that the logging provides the necessary information for debugging. Let me know if this approach works for you!


from ckanext.fairdatapoint.harvesters.civity_harvester import CivityHarvester
from ckanext.fairdatapoint.harvesters.domain.fair_data_point_record_provider import FairDataPointRecordProvider
from ckanext.fairdatapoint.harvesters.domain.fair_data_point_record_to_package_converter import \
FairDataPointRecordToPackageConverter
from ckanext.fairdatapoint.harvesters.domain.fair_data_point_record_provider import (
FairDataPointRecordProvider,
)
from ckanext.fairdatapoint.harvesters.domain.fair_data_point_record_to_package_converter import (
FairDataPointRecordToPackageConverter,
)
from ckan.plugins import toolkit

PROFILE = "profile"
HARVEST_CATALOG = "harvest_catalogs"
HARVEST_CATALOG_CONFIG = "ckanext.fairdatapoint.harvest_catalogs"

PROFILE = 'profile'
log = logging.getLogger(__name__)


class FairDataPointCivityHarvester(CivityHarvester):

def _get_harvest_catalog_setting(self, harvest_config_dict):
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

issue (complexity): Consider simplifying the configuration handling and removing unnecessary logging.

The new code introduces additional complexity due to extra logging, a new method for configuration handling, and more verbose import statements. While these changes add functionality, they also make the code harder to read and maintain. Consider simplifying the configuration handling by doing it inline within the setup_record_provider method and removing unnecessary logging. Here's a simplified version that maintains the new functionality:

from ckanext.fairdatapoint.harvesters.civity_harvester import CivityHarvester
from ckanext.fairdatapoint.harvesters.domain.fair_data_point_record_provider import FairDataPointRecordProvider
from ckanext.fairdatapoint.harvesters.domain.fair_data_point_record_to_package_converter import FairDataPointRecordToPackageConverter
from ckan.plugins import toolkit

PROFILE = "profile"
HARVEST_CATALOG = "harvest_catalogs"
HARVEST_CATALOG_CONFIG = "ckanext.fairdatapoint.harvest_catalogs"

class FairDataPointCivityHarvester(CivityHarvester):

    def setup_record_provider(self, harvest_url, harvest_config_dict):
        harvest_catalogs = toolkit.asbool(
            harvest_config_dict.get(HARVEST_CATALOG, toolkit.config.get(HARVEST_CATALOG_CONFIG, False))
        )
        self.record_provider = FairDataPointRecordProvider(harvest_url, harvest_catalogs)

    def setup_record_to_package_converter(self, harvest_url, harvest_config_dict):
        if PROFILE in harvest_config_dict:
            self.record_to_package_converter = FairDataPointRecordToPackageConverter(
                harvest_config_dict.get(PROFILE)
            )
        else:
            raise Exception(f"[{PROFILE}] not found in harvester config JSON")

    def info(self):
        return {
            "name": "fair_data_point_harvester",
            "title": "FAIR data point harvester",
            "description": "Harvester for end points implementing the FAIR data point protocol",
        }

This version reduces complexity while keeping the new features intact.

if HARVEST_CATALOG in harvest_config_dict:
log.debug("Using harvest_catalogs from harvest_config_dict")
harvest_catalog_setting = toolkit.asbool(
harvest_config_dict[HARVEST_CATALOG]
)
else:
log.debug("Using harvest_catalogs from global CKAN config")
harvest_catalog_setting = toolkit.asbool(
toolkit.config.get(HARVEST_CATALOG_CONFIG, False)
)
log.debug("Harvesting catalogs is set to %s", harvest_catalog_setting)
return harvest_catalog_setting

def setup_record_provider(self, harvest_url, harvest_config_dict):
self.record_provider = FairDataPointRecordProvider(harvest_url)
# Harvest catalog config can be set on the global CKAN level, but can be overridden by the harvest config
harvest_catalogs = self._get_harvest_catalog_setting(harvest_config_dict)

self.record_provider = FairDataPointRecordProvider(
harvest_url, harvest_catalogs
)

def setup_record_to_package_converter(self, harvest_url, harvest_config_dict):
if PROFILE in harvest_config_dict:
self.record_to_package_converter = FairDataPointRecordToPackageConverter(harvest_config_dict.get(PROFILE))
self.record_to_package_converter = FairDataPointRecordToPackageConverter(
harvest_config_dict.get(PROFILE)
)
else:
raise Exception('[{0}] not found in harvester config JSON'.format(PROFILE))
raise Exception("[{0}] not found in harvester config JSON".format(PROFILE))

def info(self):
return {
'name': 'fair_data_point_harvester',
'title': 'FAIR data point harvester',
'description': 'Harvester for end points implementing the FAIR data point protocol'
"name": "fair_data_point_harvester",
"title": "FAIR data point harvester",
"description": "Harvester for end points implementing the FAIR data point protocol",
}