Skip to content

Commit

Permalink
First iteration of parsing support
Browse files Browse the repository at this point in the history
Created multilingual versions of _object_value() and
_object_value_list() that store the different translations in the format
expected by the fluent fields, e.g.:

{
    "en": "Dataset title",
    "es": "Título del conjunto de datos"
}

and for tags:

{
    "en": ["Oaks", "Pines"],
    "es": ["Robles", "Pinos"],
}

Core fields (those ending in `_translated`) are handled separately.
  • Loading branch information
amercader committed Oct 28, 2024
1 parent 43f4411 commit 7b87dd0
Show file tree
Hide file tree
Showing 3 changed files with 224 additions and 30 deletions.
123 changes: 118 additions & 5 deletions ckanext/dcat/profiles/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,8 @@ class RDFProfile(object):

_dataset_schema = None

_form_languages = None

# Cache for mappings of licenses URL/title to ID built when needed in
# _license().
_licenceregister_cache = None
Expand All @@ -145,6 +147,9 @@ def __init__(self, graph, dataset_type="dataset", compatibility_mode=False):

self.compatibility_mode = compatibility_mode

self._default_lang = config.get("ckan.locale_default", "en")


try:
schema_show = get_action("scheming_dataset_schema_show")
try:
Expand All @@ -157,6 +162,9 @@ def __init__(self, graph, dataset_type="dataset", compatibility_mode=False):
except KeyError:
pass

if self._dataset_schema:
self._form_languages = self._dataset_schema.get("form_languages")

def _datasets(self):
"""
Generator that returns all DCAT datasets on the graph
Expand Down Expand Up @@ -201,21 +209,40 @@ def _object(self, subject, predicate):
return _object
return None

def _object_value(self, subject, predicate):
def _object_value(self, subject, predicate, multilingual=False):
"""
Given a subject and a predicate, returns the value of the object
Both subject and predicate must be rdflib URIRef or BNode objects
If found, the string representation is returned, else an empty string
If multilingual is True, a dict with the language codes as keys will be
returned for each language found. e.g.
{
"en": "Dataset title",
"es": "Título del conjunto de datos"
}
If one of the languages defined in `form_languages` in the schema is not
found in the graph, an empty string will be returned.
{
"en": "Dataset title",
"es": ""
}
"""
default_lang = config.get("ckan.locale_default", "en")
if multilingual:
return self._object_value_multilingual(subject, predicate)
fallback = ""
for o in self.g.objects(subject, predicate):
if isinstance(o, Literal):
if o.language and o.language == default_lang:
if o.language and o.language == self._default_lang:
return str(o)
# Use first object as fallback if no object with the default language is available
# Use first object as fallback if no object with the default
# language is available
elif fallback == "":
fallback = str(o)
elif len(list(self.g.objects(o, RDFS.label))):
Expand All @@ -224,6 +251,31 @@ def _object_value(self, subject, predicate):
return str(o)
return fallback

def _object_value_multilingual(self, subject, predicate):
    """
    Given a subject and a predicate, returns a dict with the language codes
    as keys and the string value of the matching objects as values. e.g.

    {
        "en": "Dataset title",
        "es": "Título del conjunto de datos"
    }

    Literals without a language tag are stored under the default language.
    Objects that are not Literals contribute their RDFS.label values if they
    have any, otherwise their own string representation is stored under the
    default language. If several values share a language, the last one
    returned by the graph wins.

    Languages defined in `form_languages` in the schema that are not found
    in the graph are added with an empty string as value.

    Both subject and predicate must be rdflib URIRef or BNode objects
    """
    out = {}
    for o in self.g.objects(subject, predicate):
        if isinstance(o, Literal):
            out[o.language or self._default_lang] = str(o)
            continue

        # Query the labels once and reuse the result for both the
        # emptiness check and the iteration
        labels = list(self.g.objects(o, RDFS.label))
        if not labels:
            out[self._default_lang] = str(o)
            continue

        for label in labels:
            # A label is not guaranteed to be a Literal, in which case it
            # has no `language` attribute
            lang = getattr(label, "language", None) or self._default_lang
            out[lang] = str(label)

    for lang in self._form_languages or []:
        out.setdefault(lang, "")

    return out

def _object_value_multiple_predicate(self, subject, predicates):
"""
Given a subject and a list of predicates, returns the value of the object
Expand Down Expand Up @@ -301,10 +353,45 @@ def _object_value_list(self, subject, predicate):
Both subject and predicate must be rdflib URIRef or BNode objects
If no values found, returns an empty string
If no values found, returns an empty list
"""
return [str(o) for o in self.g.objects(subject, predicate)]

def _object_value_list_multilingual(self, subject, predicate):
"""
Given a subject and a predicate, returns a dict with the language codes
as keys and the list of object values as values. e.g.
{
"en": ["Oaks", "Pines"],
"es": ["Robles", "Pinos"],
}
If one of the languages defined in `form_languages` in the schema is not
found in the graph, an empty list will be returned.
{
"en": ["Oaks", "Pines"],
"es": [],
}
Both subject and predicate must be rdflib URIRef or BNode objects
If no values found, returns an empty list
"""
out = {}
for o in self.g.objects(subject, predicate):
lang = o.language or self._default_lang
if lang not in out:
out[lang] = []
out[lang].append(str(o))

if self._form_languages:
for lang in self._form_languages:
if lang not in out:
out[lang] = []
return out

def _get_vcard_property_value(
self, subject, predicate, predicate_string_property=None
):
Expand Down Expand Up @@ -823,6 +910,32 @@ def _schema_resource_field(self, key):
if field["field_name"] == key:
return field

def _multilingual_dataset_fields(self):
"""
Return a list of field names in the dataset shema that have multilingual
values (i.e. that use one of the fluent presets)
"""
return self._multilingual_fields(entity="dataset")

def _multilingual_resource_fields(self):
"""
Return a list of field names in the resource schema that have multilingual
values (i.e. that use one of the fluent presets)
"""
return self._multilingual_fields(entity="resource")

def _multilingual_fields(self, entity="dataset"):
if not self._dataset_schema:
return []

out = []
for field in self._dataset_schema[f"{entity}_fields"]:
if field.get("validators") and any(
v for v in field["validators"].split() if v.startswith("fluent")
):
out.append(field["field_name"])
return out

def _set_dataset_value(self, dataset_dict, key, value):
"""
Sets the value for a given key in a CKAN dataset dict
Expand Down
86 changes: 65 additions & 21 deletions ckanext/dcat/profiles/euro_dcat_ap_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,33 +48,55 @@ def _parse_dataset_base(self, dataset_dict, dataset_ref):
dataset_dict["extras"] = []
dataset_dict["resources"] = []

multilingual_fields = self._multilingual_dataset_fields()

# Basic fields
for key, predicate in (
("title", DCT.title),
("notes", DCT.description),
("url", DCAT.landingPage),
("version", OWL.versionInfo),
):
value = self._object_value(dataset_ref, predicate)
multilingual = key in multilingual_fields
value = self._object_value(
dataset_ref, predicate, multilingual=multilingual
)
if value:
dataset_dict[key] = value

# Multilingual core fields
for key, predicate in (
("title", DCT.title),
("notes", DCT.description)
):
if f"{key}_translated" in multilingual_fields:
value = self._object_value(dataset_ref, predicate, multilingual=True)
dataset_dict[f"{key}_translated"] = value
dataset_dict[f"{key}"] = value.get(self._default_lang)
else:
value = self._object_value(dataset_ref, predicate)
if value:
dataset_dict[key] = value

if not dataset_dict.get("version"):
# adms:version was supported on the first version of the DCAT-AP
value = self._object_value(dataset_ref, ADMS.version)
if value:
dataset_dict["version"] = value

# Tags
# replace munge_tag to noop if there's no need to clean tags
do_clean = toolkit.asbool(config.get(DCAT_CLEAN_TAGS, False))
tags_val = [
munge_tag(tag) if do_clean else tag for tag in self._keywords(dataset_ref)
]
tags = [{"name": tag} for tag in tags_val]
dataset_dict["tags"] = tags

# Extras
if "tags_translated" in multilingual_fields:
dataset_dict["tags_translated"] = self._object_value_list_multilingual(
dataset_ref, DCAT.keyword)
dataset_dict["tags"] = [
{"name": t } for t in dataset_dict["tags_translated"][self._default_lang]
]
else:
# replace munge_tag to noop if there's no need to clean tags
do_clean = toolkit.asbool(config.get(DCAT_CLEAN_TAGS, False))
tags_val = [
munge_tag(tag) if do_clean else tag for tag in self._keywords(dataset_ref)
]
tags = [{"name": tag} for tag in tags_val]
dataset_dict["tags"] = tags

# Simple values
for key, predicate in (
Expand All @@ -86,7 +108,11 @@ def _parse_dataset_base(self, dataset_dict, dataset_ref):
("provenance", DCT.provenance),
("dcat_type", DCT.type),
):
value = self._object_value(dataset_ref, predicate)

multilingual = key in multilingual_fields
value = self._object_value(
dataset_ref, predicate, multilingual=multilingual
)
if value:
dataset_dict["extras"].append({"key": key, "value": value})

Expand Down Expand Up @@ -175,24 +201,47 @@ def _parse_dataset_base(self, dataset_dict, dataset_ref):

resource_dict = {}

multilingual_fields = self._multilingual_resource_fields()

# Simple values
for key, predicate in (
("name", DCT.title),
("description", DCT.description),
("access_url", DCAT.accessURL),
("download_url", DCAT.downloadURL),
("issued", DCT.issued),
("modified", DCT.modified),
("status", ADMS.status),
("license", DCT.license),
("rights", DCT.rights),
):
value = self._object_value(distribution, predicate)
multilingual = key in multilingual_fields
value = self._object_value(
distribution, predicate, multilingual=multilingual
)
if value:
resource_dict[key] = value

# Multilingual core fields
for key, predicate in (
("name", DCT.title),
("description", DCT.description)
):
if f"{key}_translated" in multilingual_fields:
value = self._object_value(
distribution, predicate, multilingual=True
)
resource_dict[f"{key}_translated"] = value
resource_dict[f"{key}"] = value.get(self._default_lang)
else:
value = self._object_value(distribution, predicate)
if value:
resource_dict[key] = value

# URL

resource_dict["url"] = self._object_value(
distribution, DCAT.downloadURL
) or self._object_value(distribution, DCAT.accessURL)

# Lists
for key, predicate in (
("language", DCT.language),
Expand All @@ -203,11 +252,6 @@ def _parse_dataset_base(self, dataset_dict, dataset_ref):
if values:
resource_dict[key] = json.dumps(values)

# rights
rights = self._access_rights(distribution, DCT.rights)
if rights:
resource_dict["rights"] = rights

# Format and media type
normalize_ckan_format = toolkit.asbool(
config.get("ckanext.dcat.normalize_ckan_format", True)
Expand Down
45 changes: 41 additions & 4 deletions ckanext/dcat/tests/profiles/dcat_ap_2/test_multilingual_support.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,8 +210,45 @@ def test_e2e_dcat_to_ckan(self):
dataset_dict["name"] = "test-dcat-1"
dataset = call_action("package_create", **dataset_dict)

# Core fields
# Dataset core fields
assert dataset["title"] == "Test DCAT dataset"
assert dataset["title_translated"]["en"] == "Test DCAT dataset"
assert dataset["title_translated"]["ca"] == "Conjunt de dades de prova DCAT"
assert dataset["title_translated"]["es"] == "Conjunto de datos de prueba DCAT"

assert dataset["notes"] == "Some description"
assert dataset["notes_translated"]["en"] == "Some description"
assert dataset["notes_translated"]["ca"] == "Una descripció qualsevol"
assert dataset["notes_translated"]["es"] == "Una descripción cualquiera"

# Tags
assert sorted(dataset["tags_translated"]["en"]) == sorted(["Oaks", "Pines"])
assert sorted(dataset["tags_translated"]["ca"]) == sorted(["Roures", "Pins"])
assert sorted(dataset["tags_translated"]["es"]) == sorted(["Robles", "Pinos"])

# Dataset fields
assert dataset["provenance"]["en"] == "Statement about provenance"
assert dataset["provenance"]["ca"] == "Una declaració sobre la procedència"
assert dataset["provenance"]["es"] == "Una declaración sobre la procedencia"

assert dataset["version_notes"]["en"] == "Some version notes"
assert dataset["version_notes"]["ca"] == "Notes sobre la versió"
assert dataset["version_notes"]["es"] == "Notas sobre la versión"

resource = dataset["resources"][0]

# Resource core fields
assert resource["name"] == "Resource 1"
assert resource["name_translated"]["en"] == "Resource 1"
assert resource["name_translated"]["ca"] == "Recurs 1"
assert resource["name_translated"]["es"] == "Recurso 1"

assert dataset["title"]["en"] == "Test DCAT dataset"
assert dataset["title"]["ca"] == "Conjunt de dades de prova DCAT"
assert dataset["title"]["es"] == "Conjunto de datos de prueba DCAT"
assert resource["description"] == "Some description"
assert resource["description_translated"]["en"] == "Some description"
assert resource["description_translated"]["ca"] == "Una descripció qualsevol"
assert resource["description_translated"]["es"] == "Una descripción cualquiera"

# Resource fields
assert resource["rights"]["en"] == "Some stament about rights"
assert resource["rights"]["ca"] == "Una nota sobre drets"
assert resource["rights"]["es"] == "Una nota sobre derechos"

0 comments on commit 7b87dd0

Please sign in to comment.