diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 06b25f56..1dffeb98 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -71,6 +71,8 @@ jobs: pip install -r ckanext-harvest/requirements.txt git clone https://github.com/ckan/ckanext-scheming pip install -e ckanext-scheming + git clone https://github.com/ckan/ckanext-fluent + pip install -e ckanext-fluent - name: Setup extension run: | ckan -c test.ini db init diff --git a/README.md b/README.md index 02a0aa13..9f7e85ec 100644 --- a/README.md +++ b/README.md @@ -43,6 +43,8 @@ To run the tests do: pytest --ckan-ini=test.ini ckanext/dcat/tests +Note that there are tests relying on having [ckanext-harvest](https://github.com/ckan/ckanext-harvest), [ckanext-scheming](https://github.com/ckan/ckanext-scheming) and [ckanext-fluent](https://github.com/ckan/ckanext-fluent) installed. + ## Releases To create a new release, follow these steps: diff --git a/ckanext/dcat/profiles/base.py b/ckanext/dcat/profiles/base.py index 08c78d0d..e10fda88 100644 --- a/ckanext/dcat/profiles/base.py +++ b/ckanext/dcat/profiles/base.py @@ -69,7 +69,7 @@ class URIRefOrLiteral(object): Like CleanedURIRef, this is a factory class. """ - def __new__(cls, value): + def __new__(cls, value, lang=None): try: stripped_value = value.strip() if isinstance(value, str) and ( @@ -83,10 +83,10 @@ def __new__(cls, value): # URI is fine, return the object return uri_obj else: - return Literal(value) + return Literal(value, lang=lang) except Exception: # In case something goes wrong: use Literal - return Literal(value) + return Literal(value, lang=lang) class CleanedURIRef(object): @@ -123,6 +123,8 @@ class RDFProfile(object): _dataset_schema = None + _form_languages = None + # Cache for mappings of licenses URL/title to ID built when needed in # _license(). _licenceregister_cache = None @@ -145,6 +147,9 @@ def __init__(self, graph, dataset_type="dataset", compatibility_mode=False): self.compatibility_mode = compatibility_mode + self._default_lang = config.get("ckan.locale_default", "en") + + try: schema_show = get_action("scheming_dataset_schema_show") try: @@ -157,6 +162,9 @@ def __init__(self, graph, dataset_type="dataset", compatibility_mode=False): except KeyError: pass + if self._dataset_schema: + self._form_languages = self._dataset_schema.get("form_languages") + def _datasets(self): """ Generator that returns all DCAT datasets on the graph @@ -201,21 +209,40 @@ def _object(self, subject, predicate): return _object return None - def _object_value(self, subject, predicate): + def _object_value(self, subject, predicate, multilingual=False): """ Given a subject and a predicate, returns the value of the object Both subject and predicate must be rdflib URIRef or BNode objects If found, the string representation is returned, else an empty string + + If multilingual is True, a dict with the language codes as keys will be + returned for each language found. e.g. + + { + "en": "Dataset title", + "es": "Título del conjunto de datos" + } + + If one of the languages defined in `form_languages` in the schema is not + found in the graph, an empty string will be returned. 
+ + { + "en": "Dataset title", + "es": "" + } + """ - default_lang = config.get("ckan.locale_default", "en") + if multilingual: + return self._object_value_multilingual(subject, predicate) fallback = "" for o in self.g.objects(subject, predicate): if isinstance(o, Literal): - if o.language and o.language == default_lang: + if o.language and o.language == self._default_lang: return str(o) - # Use first object as fallback if no object with the default language is available + # Use first object as fallback if no object with the default + # language is available elif fallback == "": fallback = str(o) elif len(list(self.g.objects(o, RDFS.label))): @@ -224,6 +251,31 @@ def _object_value(self, subject, predicate): return str(o) return fallback + def _object_value_multilingual(self, subject, predicate): + out = {} + for o in self.g.objects(subject, predicate): + + if isinstance(o, Literal): + if o.language: + out[o.language] = str(o) + else: + out[self._default_lang] = str(o) + elif len(list(self.g.objects(o, RDFS.label))): + for label in self.g.objects(o, RDFS.label): + if label.language: + out[label.language] = str(label) + else: + out[self._default_lang] = str(label) + else: + out[self._default_lang] = str(o) + + if self._form_languages: + for lang in self._form_languages: + if lang not in out: + out[lang] = "" + + return out + def _object_value_multiple_predicate(self, subject, predicates): """ Given a subject and a list of predicates, returns the value of the object @@ -301,10 +353,45 @@ def _object_value_list(self, subject, predicate): Both subject and predicate must be rdflib URIRef or BNode objects - If no values found, returns an empty string + If no values found, returns an empty list """ return [str(o) for o in self.g.objects(subject, predicate)] + def _object_value_list_multilingual(self, subject, predicate): + """ + Given a subject and a predicate, returns a dict with the language codes + as keys and the list of object values as values. e.g. + + { + "en": ["Oaks", "Pines"], + "es": ["Robles", "Pinos"], + } + + If one of the languages defined in `form_languages` in the schema is not + found in the graph, an empty list will be returned. 
+ + { + "en": ["Oaks", "Pines"], + "es": [], + } + + Both subject and predicate must be rdflib URIRef or BNode objects + + If no values found, returns an empty list + """ + out = {} + for o in self.g.objects(subject, predicate): + lang = o.language or self._default_lang + if lang not in out: + out[lang] = [] + out[lang].append(str(o)) + + if self._form_languages: + for lang in self._form_languages: + if lang not in out: + out[lang] = [] + return out + def _get_vcard_property_value( self, subject, predicate, predicate_string_property=None ): @@ -786,18 +873,25 @@ def _add_statement_to_graph(self, data_dict, key, subject, predicate, _class=Non """ value = self._get_dict_value(data_dict, key) if value: - _object = URIRefOrLiteral(value) - if isinstance(_object, Literal): - statement_ref = BNode() - self.g.add((subject, predicate, statement_ref)) - if _class: - self.g.add((statement_ref, RDF.type, _class)) - self.g.add((statement_ref, RDFS.label, _object)) - + if isinstance(value, dict): + _objects = [] + for lang in value: + _objects.append(URIRefOrLiteral(value[lang], lang)) else: - self.g.add((subject, predicate, _object)) - if _class: - self.g.add((_object, RDF.type, _class)) + _objects = [URIRefOrLiteral(value)] + statement_ref = None + for _object in _objects: + if isinstance(_object, Literal): + if not statement_ref: + statement_ref = BNode() + self.g.add((subject, predicate, statement_ref)) + if _class: + self.g.add((statement_ref, RDF.type, _class)) + self.g.add((statement_ref, RDFS.label, _object)) + else: + self.g.add((subject, predicate, _object)) + if _class: + self.g.add((_object, RDF.type, _class)) def _schema_field(self, key): """ @@ -823,6 +917,32 @@ def _schema_resource_field(self, key): if field["field_name"] == key: return field + def _multilingual_dataset_fields(self): + """ + Return a list of field names in the dataset schema that have multilingual + values (i.e. that use one of the fluent presets) + """ + return self._multilingual_fields(entity="dataset") + + def _multilingual_resource_fields(self): + """ + Return a list of field names in the resource schema that have multilingual + values (i.e.
that use one of the fluent presets) + """ + return self._multilingual_fields(entity="resource") + + def _multilingual_fields(self, entity="dataset"): + if not self._dataset_schema: + return [] + + out = [] + for field in self._dataset_schema[f"{entity}_fields"]: + if field.get("validators") and any( + v for v in field["validators"].split() if v.startswith("fluent") + ): + out.append(field["field_name"]) + return out + def _set_dataset_value(self, dataset_dict, key, value): """ Sets the value for a given key in a CKAN dataset dict @@ -949,7 +1069,16 @@ def _add_triple_from_dict( elif value and date_value: self._add_date_triple(subject, predicate, value, _type) elif value: + # If it is a dict, we assume it's a fluent multilingual field + if isinstance(value, dict): + # We assume that all translated field values are Literals + for lang, translated_value in value.items(): + object = Literal(translated_value, datatype=_datatype, lang=lang) + self.g.add((subject, predicate, object)) + return + # Normal text value + # ensure URIRef items are preprocessed (space removal/url encoding) if _type == URIRef: _type = CleanedURIRef diff --git a/ckanext/dcat/profiles/euro_dcat_ap_base.py b/ckanext/dcat/profiles/euro_dcat_ap_base.py index 57458c6a..2356a2d4 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_base.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_base.py @@ -48,17 +48,34 @@ def _parse_dataset_base(self, dataset_dict, dataset_ref): dataset_dict["extras"] = [] dataset_dict["resources"] = [] + multilingual_fields = self._multilingual_dataset_fields() + # Basic fields for key, predicate in ( - ("title", DCT.title), - ("notes", DCT.description), ("url", DCAT.landingPage), ("version", OWL.versionInfo), ): - value = self._object_value(dataset_ref, predicate) + multilingual = key in multilingual_fields + value = self._object_value( + dataset_ref, predicate, multilingual=multilingual + ) if value: dataset_dict[key] = value + # Multilingual core fields + for key, predicate in ( + ("title", DCT.title), + ("notes", DCT.description) + ): + if f"{key}_translated" in multilingual_fields: + value = self._object_value(dataset_ref, predicate, multilingual=True) + dataset_dict[f"{key}_translated"] = value + dataset_dict[f"{key}"] = value.get(self._default_lang) + else: + value = self._object_value(dataset_ref, predicate) + if value: + dataset_dict[key] = value + if not dataset_dict.get("version"): # adms:version was supported on the first version of the DCAT-AP value = self._object_value(dataset_ref, ADMS.version) @@ -66,15 +83,20 @@ def _parse_dataset_base(self, dataset_dict, dataset_ref): dataset_dict["version"] = value # Tags - # replace munge_tag to noop if there's no need to clean tags - do_clean = toolkit.asbool(config.get(DCAT_CLEAN_TAGS, False)) - tags_val = [ - munge_tag(tag) if do_clean else tag for tag in self._keywords(dataset_ref) - ] - tags = [{"name": tag} for tag in tags_val] - dataset_dict["tags"] = tags - - # Extras + if "tags_translated" in multilingual_fields: + dataset_dict["tags_translated"] = self._object_value_list_multilingual( + dataset_ref, DCAT.keyword) + dataset_dict["tags"] = [ + {"name": t } for t in dataset_dict["tags_translated"][self._default_lang] + ] + else: + # replace munge_tag to noop if there's no need to clean tags + do_clean = toolkit.asbool(config.get(DCAT_CLEAN_TAGS, False)) + tags_val = [ + munge_tag(tag) if do_clean else tag for tag in self._keywords(dataset_ref) + ] + tags = [{"name": tag} for tag in tags_val] + dataset_dict["tags"] = tags # Simple values for key, 
predicate in ( @@ -86,7 +108,11 @@ def _parse_dataset_base(self, dataset_dict, dataset_ref): ("provenance", DCT.provenance), ("dcat_type", DCT.type), ): - value = self._object_value(dataset_ref, predicate) + + multilingual = key in multilingual_fields + value = self._object_value( + dataset_ref, predicate, multilingual=multilingual + ) if value: dataset_dict["extras"].append({"key": key, "value": value}) @@ -185,24 +211,47 @@ def _parse_dataset_base(self, dataset_dict, dataset_ref): resource_dict = {} + multilingual_fields = self._multilingual_resource_fields() + # Simple values for key, predicate in ( - ("name", DCT.title), - ("description", DCT.description), ("access_url", DCAT.accessURL), ("download_url", DCAT.downloadURL), ("issued", DCT.issued), ("modified", DCT.modified), ("status", ADMS.status), ("license", DCT.license), + ("rights", DCT.rights), ): - value = self._object_value(distribution, predicate) + multilingual = key in multilingual_fields + value = self._object_value( + distribution, predicate, multilingual=multilingual + ) if value: resource_dict[key] = value + # Multilingual core fields + for key, predicate in ( + ("name", DCT.title), + ("description", DCT.description) + ): + if f"{key}_translated" in multilingual_fields: + value = self._object_value( + distribution, predicate, multilingual=True + ) + resource_dict[f"{key}_translated"] = value + resource_dict[f"{key}"] = value.get(self._default_lang) + else: + value = self._object_value(distribution, predicate) + if value: + resource_dict[key] = value + + # URL + resource_dict["url"] = self._object_value( distribution, DCAT.downloadURL ) or self._object_value(distribution, DCAT.accessURL) + # Lists for key, predicate in ( ("language", DCT.language), @@ -213,11 +262,6 @@ def _parse_dataset_base(self, dataset_dict, dataset_ref): if values: resource_dict[key] = json.dumps(values) - # rights - rights = self._access_rights(distribution, DCT.rights) - if rights: - resource_dict["rights"] = rights - # Format and media type normalize_ckan_format = toolkit.asbool( config.get("ckanext.dcat.normalize_ckan_format", True) @@ -284,9 +328,19 @@ def _graph_from_dataset_base(self, dataset_dict, dataset_ref): g.add((dataset_ref, RDF.type, DCAT.Dataset)) # Basic fields + title_key = ( + "title_translated" + if "title_translated" in dataset_dict + else "title" + ) + notes_key = ( + "notes_translated" + if "notes_translated" in dataset_dict + else "notes" + ) items = [ - ("title", DCT.title, None, Literal), - ("notes", DCT.description, None, Literal), + (title_key, DCT.title, None, Literal), + (notes_key, DCT.description, None, Literal), ("url", DCAT.landingPage, None, URIRef, FOAF.Document), ("identifier", DCT.identifier, ["guid", "id"], URIRefOrLiteral), ("version", OWL.versionInfo, ["dcat_version"], Literal), @@ -297,8 +351,13 @@ def _graph_from_dataset_base(self, dataset_dict, dataset_ref): self._add_triples_from_dict(dataset_dict, dataset_ref, items) # Tags - for tag in dataset_dict.get("tags", []): - g.add((dataset_ref, DCAT.keyword, Literal(tag["name"]))) + if "tags_translated" in dataset_dict: + for lang in dataset_dict["tags_translated"]: + for value in dataset_dict["tags_translated"][lang]: + g.add((dataset_ref, DCAT.keyword, Literal(value, lang=lang))) + else: + for tag in dataset_dict.get("tags", []): + g.add((dataset_ref, DCAT.keyword, Literal(tag["name"]))) # Dates items = [ @@ -535,9 +594,18 @@ def _graph_from_dataset_base(self, dataset_dict, dataset_ref): g.add((distribution, RDF.type, DCAT.Distribution)) # Simple values + 
name_key = ( + "name_translated" if "name_translated" in resource_dict else "name" + ) + description_key = ( + "description_translated" + if "description_translated" in resource_dict + else "description" + ) + items = [ - ("name", DCT.title, None, Literal), - ("description", DCT.description, None, Literal), + (name_key, DCT.title, None, Literal), + (description_key, DCT.description, None, Literal), ("status", ADMS.status, None, URIRefOrLiteral), ("license", DCT.license, None, URIRefOrLiteral, DCT.LicenseDocument), ("access_url", DCAT.accessURL, None, URIRef, RDFS.Resource), diff --git a/ckanext/dcat/schemas/dcat_ap_multilingual.yaml b/ckanext/dcat/schemas/dcat_ap_multilingual.yaml new file mode 100644 index 00000000..63c07c7a --- /dev/null +++ b/ckanext/dcat/schemas/dcat_ap_multilingual.yaml @@ -0,0 +1,266 @@ +scheming_version: 2 +dataset_type: dataset +about: DCAT AP schema with multilingual fields using ckanext-fluent +about_url: http://github.com/ckan/ckanext-dcat +form_languages: [en, ca, es] +dataset_fields: +- field_name: title_translated + label: + en: Title + ca: Títol + es: Título + fluent_form_label: + en: + en: Title (English) + ca: Títol (Anglès) + es: Título (Inglés) + ca: + en: Title (Catalan) + ca: Títol (Català) + es: Título (Catalán) + es: + en: Title (Spanish) + ca: Títol (Espanyol) + es: Título (Español) + preset: fluent_core_translated + required: true + +- field_name: name + label: URL + preset: dataset_slug + form_placeholder: eg. my-dataset + +- field_name: notes_translated + label: + en: Description + ca: Descripció + es: Descripción + fluent_form_label: + en: + en: Description (English) + ca: Descripció (Anglès) + es: Descripción (Inglés) + ca: + en: Description (Catalan) + ca: Descripció (Català) + es: Descripción (Catalán) + es: + en: Description (Spanish) + ca: Descripció (Espanyol) + es: Descripción (Español) + preset: fluent_core_translated + form_snippet: fluent_markdown.html + display_snippet: fluent_markdown.html + required: true + +- field_name: tags_translated + label: + en: Keywords + ca: Mots clau + es: Palabras clave + fluent_form_label: + en: + en: Keywords (English) + ca: Paraules clau (Anglès) + es: Palabras clave (Inglés) + ca: + en: Keywords (Catalan) + ca: Paraules clau (Català) + es: Palabras clave (Catalán) + es: + en: Keywords (Spanish) + ca: Paraules clau (Espanyol) + es: Palabras clave (Español) + preset: fluent_tags + fluent_help_text: + en: + en: Keywords or tags describing the dataset. Use commas to separate multiple values (English). + ca: Paraules clau o etiquetes per descriure el conjunt de dades. Useu comes per separar-les (Anglès). + es: Palabras clave o etiquetas para describir el conjunto de datos. Usen comas para separarlas (Inglés). + ca: + en: Keywords or tags describing the dataset. Use commas to separate multiple values (Catalan). + ca: Paraules clau o etiquetes per descriure el conjunt de dades. Useu comes per separar-les (Català). + es: Palabras clave o etiquetas para describir el conjunto de datos. Usen comas para separarlas (Catalan). + es: + en: Keywords or tags describing the dataset. Use commas to separate multiple values (Spanish). + ca: Paraules clau o etiquetes per descriure el conjunt de dades. Useu comes per separar-les (Espanyol). + es: Palabras clave o etiquetas para describir el conjunto de datos. Usen comas para separarlas (Español). 
+ +- field_name: contact + label: Contact points + repeating_label: Contact point + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: name + label: Name + + - field_name: email + label: Email + display_snippet: email.html + + - field_name: identifier + label: Identifier + help_text: Unique identifier for the contact point. Such as a ROR ID. + + help_text: Contact information for enquiries about the dataset. + +- field_name: license_id + label: License + form_snippet: license.html + help_text: License definitions and additional information can be found at http://opendefinition.org/. + +- field_name: owner_org + label: Organization + preset: dataset_organization + help_text: The CKAN organization the dataset belongs to. + +- field_name: url + label: Landing page + form_placeholder: http://example.com/dataset.json + display_snippet: link.html + help_text: Web page that can be navigated to gain access to the dataset, its distributions and/or additional information. + + # Note: this will fall back to metadata_created if not present +- field_name: issued + label: Release date + preset: dcat_date + help_text: Date of publication of the dataset. + + # Note: this will fall back to metadata_modified if not present +- field_name: modified + label: Modification date + preset: dcat_date + help_text: Most recent date on which the dataset was changed, updated or modified. + +- field_name: version + label: Version + validators: ignore_missing unicode_safe package_version_validator + help_text: Version number or other version designation of the dataset. + +- field_name: version_notes + preset: fluent_markdown + label: + en: Version notes + ca: Notes sobre la versió + es: Notas sobre la versión + +- field_name: provenance + preset: fluent_markdown + label: + en: Provenance + ca: Procedència + es: Procedencia + +- field_name: language + label: Language + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: Language or languages of the dataset. + # TODO: language form snippet / validator / graph + +- field_name: documentation + label: Documentation + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: A page or document about this dataset. + +# Note: if not provided, this will be autogenerated +- field_name: uri + label: URI + help_text: An URI for this dataset (if not provided it will be autogenerated). + +# TODO: relation-based properties are not yet included (e.g. is_version_of, source, sample, etc) +# +resource_fields: + +- field_name: url + label: URL + preset: resource_url_upload + +- field_name: name_translated + label: + en: Title + ca: Títol + es: Título + fluent_form_label: + en: + en: Title (English) + ca: Títol (Anglès) + es: Título (Inglés) + ca: + en: Title (Catalan) + ca: Títol (Català) + es: Título (Catalán) + es: + en: Title (Spanish) + ca: Títol (Espanyol) + es: Título (Español) + preset: fluent_core_translated + required: true + +- field_name: description_translated + label: + en: Description + ca: Descripció + es: Descripción + fluent_form_label: + en: + en: Description (English) + ca: Descripció (Anglès) + es: Descripción (Inglés) + ca: + en: Description (Catalan) + ca: Descripció (Català) + es: Descripción (Catalán) + es: + en: Description (Spanish) + ca: Descripció (Espanyol) + es: Descripción (Español) + preset: fluent_core_translated + +- field_name: format + label: Format + preset: resource_format_autocomplete + help_text: File format. If not provided it will be guessed. 
+ +- field_name: rights + preset: fluent_markdown + label: + en: Rights + ca: Drets + es: Derechos + + # Note: this falls back to the standard resource url field +- field_name: access_url + label: Access URL + help_text: URL that gives access to the dataset (defaults to the standard resource URL). + + # Note: this falls back to the standard resource url field +- field_name: download_url + label: Download URL + display_snippet: link.html + help_text: URL that provides a direct link to a downloadable file (defaults to the standard resource URL). + +- field_name: issued + label: Release date + preset: dcat_date + help_text: Date of publication of the resource. + +- field_name: modified + label: Modification date + preset: dcat_date + help_text: Most recent date on which the resource was changed, updated or modified. + +- field_name: language + label: Language + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: Language or languages of the resource. + + # Note: if not provided, this will be autogenerated +- field_name: uri + label: URI + help_text: An URI for this resource (if not provided it will be autogenerated). diff --git a/ckanext/dcat/tests/profiles/base/test_base_profile.py b/ckanext/dcat/tests/profiles/base/test_base_profile.py index fb08f51e..aa8daf1e 100644 --- a/ckanext/dcat/tests/profiles/base/test_base_profile.py +++ b/ckanext/dcat/tests/profiles/base/test_base_profile.py @@ -247,6 +247,226 @@ def test_object_list_not_found(self): assert isinstance(value, list) assert value == [] + def test_object_value_multilingual(self): + + data = """ + @prefix dcat: . + @prefix dct: . + + a dcat:Dataset ; + dct:description "Una descripció qualsevol"@ca, + "Some description"@en, + "Una descripción cualquiera"@es ; + . + """ + g = Graph() + + g.parse(format='ttl', data=data) + + p = RDFProfile(g) + + description = p._object_value_multilingual( + URIRef("https://example.org/dataset/"), DCT.description + ) + assert description["en"] == "Some description" + assert description["ca"] == "Una descripció qualsevol" + assert description["es"] == "Una descripción cualquiera" + + def test_object_value_multilingual_missing_lang(self): + + data = """ + @prefix dcat: . + @prefix dct: . + + a dcat:Dataset ; + dct:description "Una descripció qualsevol"@ca, + "Some description"@en; + . + """ + g = Graph() + + g.parse(format='ttl', data=data) + + p = RDFProfile(g) + + p._form_languages = ["en", "ca", "es"] + + description = p._object_value_multilingual( + URIRef("https://example.org/dataset/"), DCT.description + ) + assert description["en"] == "Some description" + assert description["ca"] == "Una descripció qualsevol" + assert description["es"] == "" + + def test_object_value_multilingual_default_lang(self): + + data = """ + @prefix dcat: . + @prefix dct: . + + a dcat:Dataset ; + dct:description "Some description"; + . + """ + g = Graph() + + g.parse(format='ttl', data=data) + + p = RDFProfile(g) + + description = p._object_value_multilingual( + URIRef("https://example.org/dataset/"), DCT.description + ) + assert description["en"] == "Some description" + + @pytest.mark.ckan_config("ckan.locale_default", "ca") + def test_object_value_multilingual_default_lang_config(self): + + data = """ + @prefix dcat: . + @prefix dct: . + + a dcat:Dataset ; + dct:description "Some description"; + . 
+ """ + g = Graph() + + g.parse(format='ttl', data=data) + + p = RDFProfile(g) + + description = p._object_value_multilingual( + URIRef("https://example.org/dataset/"), DCT.description + ) + assert description["ca"] == "Some description" + + def test_object_value_multilingual_rdfs_label(self): + + data = """ + @prefix dcat: . + @prefix dct: . + @prefix rdfs: . + + a dcat:Dataset ; + dct:provenance [ a dct:ProvenanceStatement ; + rdfs:label "Una declaració sobre la procedència"@ca, + "Statement about provenance"@en, + "Una declaración sobre la procedencia"@es ] ; + . + """ + g = Graph() + + g.parse(format='ttl', data=data) + + p = RDFProfile(g) + + provenance = p._object_value_multilingual( + URIRef("https://example.org/dataset/"), DCT.provenance + ) + assert provenance["en"] == "Statement about provenance" + assert provenance["ca"] == "Una declaració sobre la procedència" + assert provenance["es"] == "Una declaración sobre la procedencia" + + def test_object_value_multilingual_list(self): + + data = """ + @prefix dcat: . + @prefix dct: . + + a dcat:Dataset ; + dcat:keyword "Pins"@ca, + "Roures"@ca, + "Oaks"@en, + "Pines"@en, + "Pinos"@es, + "Robles"@es ; + . + """ + g = Graph() + + g.parse(format='ttl', data=data) + + p = RDFProfile(g) + + keywords = p._object_value_list_multilingual( + URIRef("https://example.org/dataset/"), DCAT.keyword + ) + assert sorted(keywords["en"]) == sorted(["Oaks", "Pines"]) + assert sorted(keywords["ca"]) == sorted(["Roures", "Pins"]) + assert sorted(keywords["es"]) == sorted(["Robles", "Pinos"]) + + def test_object_value_multilingual_list_missing_lang(self): + + data = """ + @prefix dcat: . + @prefix dct: . + + a dcat:Dataset ; + dcat:keyword "Pins"@ca, + "Roures"@ca, + "Oaks"@en, + "Pines"@en ; + . + """ + g = Graph() + + g.parse(format='ttl', data=data) + + p = RDFProfile(g) + + p._form_languages = ["en", "ca", "es"] + + keywords = p._object_value_list_multilingual( + URIRef("https://example.org/dataset/"), DCAT.keyword + ) + assert keywords["es"] == [] + + def test_object_value_multilingual_list_default_lang(self): + + data = """ + @prefix dcat: . + @prefix dct: . + + a dcat:Dataset ; + dcat:keyword "Oaks", + "Pines" ; + . + """ + g = Graph() + + g.parse(format='ttl', data=data) + + p = RDFProfile(g) + + keywords = p._object_value_list_multilingual( + URIRef("https://example.org/dataset/"), DCAT.keyword + ) + assert sorted(keywords["en"]) == sorted(["Oaks", "Pines"]) + + @pytest.mark.ckan_config("ckan.locale_default", "ca") + def test_object_value_multilingual_list_default_lang_conf(self): + + data = """ + @prefix dcat: . + @prefix dct: . + + a dcat:Dataset ; + dcat:keyword "Oaks", + "Pines" ; + . 
+ """ + g = Graph() + + g.parse(format='ttl', data=data) + + p = RDFProfile(g) + + keywords = p._object_value_list_multilingual( + URIRef("https://example.org/dataset/"), DCAT.keyword + ) + assert sorted(keywords["ca"]) == sorted(["Oaks", "Pines"]) + def test_time_interval_schema_org(self): data = ''' diff --git a/ckanext/dcat/tests/profiles/dcat_ap_2/test_multilingual_support.py b/ckanext/dcat/tests/profiles/dcat_ap_2/test_multilingual_support.py new file mode 100644 index 00000000..46db0c52 --- /dev/null +++ b/ckanext/dcat/tests/profiles/dcat_ap_2/test_multilingual_support.py @@ -0,0 +1,254 @@ +import json + +import pytest + +from ckan.tests.helpers import call_action +from ckanext.dcat.processors import RDFSerializer, RDFParser +from ckanext.dcat.profiles import ( + DCAT, + DCATAP, + DCT, + ADMS, + VCARD, + FOAF, + SKOS, + LOCN, + GSP, + OWL, + SPDX, + RDFS, +) +from ckanext.dcat.tests.utils import BaseSerializeTest, BaseParseTest + + +@pytest.mark.usefixtures("with_plugins", "clean_db") +@pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets fluent") +@pytest.mark.ckan_config( + "scheming.dataset_schemas", "ckanext.dcat.schemas:dcat_ap_multilingual.yaml" +) +@pytest.mark.ckan_config( + "ckanext.dcat.rdf.profiles", "euro_dcat_ap_2 euro_dcat_ap_scheming" +) +class TestSchemingFluentSerializeSupport(BaseSerializeTest): + def test_e2e_ckan_to_dcat(self): + """ + Create a dataset using the scheming fluent schema, check that fields + are exposed in the DCAT RDF graph with the approapiate language + """ + + dataset_dict = json.loads( + self._get_file_contents("ckan/ckan_dataset_multilingual.json") + ) + + dataset = call_action("package_create", **dataset_dict) + + # Make sure scheming and fluent was used + assert dataset["title_translated"]["en"] == "Test DCAT dataset" + assert dataset["version_notes"]["ca"] == "Notes sobre la versió" + + s = RDFSerializer() + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + assert self._triple( + g, dataset_ref, DCT.title, dataset["title_translated"]["en"], lang="en" + ) + assert self._triple( + g, dataset_ref, DCT.title, dataset["title_translated"]["ca"], lang="ca" + ) + assert self._triple( + g, dataset_ref, DCT.title, dataset["title_translated"]["es"], lang="es" + ) + + assert self._triple( + g, + dataset_ref, + DCT.description, + dataset["notes_translated"]["en"], + lang="en", + ) + assert self._triple( + g, + dataset_ref, + DCT.description, + dataset["notes_translated"]["ca"], + lang="ca", + ) + assert self._triple( + g, + dataset_ref, + DCT.description, + dataset["notes_translated"]["es"], + lang="es", + ) + + assert self._triple( + g, + dataset_ref, + ADMS.versionNotes, + dataset["version_notes"]["en"], + lang="en", + ) + assert self._triple( + g, + dataset_ref, + ADMS.versionNotes, + dataset["version_notes"]["ca"], + lang="ca", + ) + assert self._triple( + g, + dataset_ref, + ADMS.versionNotes, + dataset["version_notes"]["es"], + lang="es", + ) + + statement = [s for s in g.objects(dataset_ref, DCT.provenance)][0] + assert self._triple( + g, statement, RDFS.label, dataset["provenance"]["en"], lang="en" + ) + assert self._triple( + g, statement, RDFS.label, dataset["provenance"]["ca"], lang="ca" + ) + assert self._triple( + g, statement, RDFS.label, dataset["provenance"]["es"], lang="es" + ) + + assert len([t for t in g.triples((dataset_ref, DCAT.keyword, None))]) == 6 + for lang in dataset["tags_translated"]: + for tag in dataset["tags_translated"][lang]: + assert self._triple(g, dataset_ref, DCAT.keyword, tag, lang=lang) + + # 
Resource fields + + distribution_ref = self._triple(g, dataset_ref, DCAT.distribution, None)[2] + resource = dataset_dict["resources"][0] + + assert self._triple( + g, distribution_ref, DCT.title, resource["name_translated"]["en"], lang="en" + ) + assert self._triple( + g, distribution_ref, DCT.title, resource["name_translated"]["ca"], lang="ca" + ) + assert self._triple( + g, distribution_ref, DCT.title, resource["name_translated"]["es"], lang="es" + ) + + assert self._triple( + g, + distribution_ref, + DCT.description, + resource["description_translated"]["en"], + lang="en", + ) + assert self._triple( + g, + distribution_ref, + DCT.description, + resource["description_translated"]["ca"], + lang="ca", + ) + assert self._triple( + g, + distribution_ref, + DCT.description, + resource["description_translated"]["es"], + lang="es", + ) + + statement = [s for s in g.objects(distribution_ref, DCT.rights)][0] + assert self._triple( + g, statement, RDFS.label, resource["rights"]["en"], lang="en" + ) + assert self._triple( + g, statement, RDFS.label, resource["rights"]["ca"], lang="ca" + ) + assert self._triple( + g, statement, RDFS.label, resource["rights"]["es"], lang="es" + ) + + # Check non translated fields for good measure + + assert self._triple(g, dataset_ref, OWL.versionInfo, dataset["version"]) + + contact_details = [t for t in g.triples((dataset_ref, DCAT.contactPoint, None))] + + assert len(contact_details) == len(dataset["contact"]) + assert self._triple( + g, contact_details[0][2], VCARD.fn, dataset_dict["contact"][0]["name"] + ) + + +@pytest.mark.usefixtures("with_plugins", "clean_db") +@pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets fluent") +@pytest.mark.ckan_config( + "scheming.dataset_schemas", "ckanext.dcat.schemas:dcat_ap_multilingual.yaml" +) +@pytest.mark.ckan_config( + "ckanext.dcat.rdf.profiles", "euro_dcat_ap_2 euro_dcat_ap_scheming" +) +class TestSchemingFluentParseSupport(BaseParseTest): + def test_e2e_dcat_to_ckan(self): + """ + Parse a DCAT RDF graph into a CKAN dataset dict, create a dataset with + package_create and check that all the translated fields are there + """ + contents = self._get_file_contents("dcat/dataset_multilingual.ttl") + + p = RDFParser() + + p.parse(contents, _format="ttl") + + datasets = [d for d in p.datasets()] + + assert len(datasets) == 1 + + dataset_dict = datasets[0] + + dataset_dict["name"] = "test-dcat-1" + dataset = call_action("package_create", **dataset_dict) + + # Dataset core fields + assert dataset["title"] == "Test DCAT dataset" + assert dataset["title_translated"]["en"] == "Test DCAT dataset" + assert dataset["title_translated"]["ca"] == "Conjunt de dades de prova DCAT" + assert dataset["title_translated"]["es"] == "Conjunto de datos de prueba DCAT" + + assert dataset["notes"] == "Some description" + assert dataset["notes_translated"]["en"] == "Some description" + assert dataset["notes_translated"]["ca"] == "Una descripció qualsevol" + assert dataset["notes_translated"]["es"] == "Una descripción cualquiera" + + # Tags + assert sorted(dataset["tags_translated"]["en"]) == sorted(["Oaks", "Pines"]) + assert sorted(dataset["tags_translated"]["ca"]) == sorted(["Roures", "Pins"]) + assert sorted(dataset["tags_translated"]["es"]) == sorted(["Robles", "Pinos"]) + + # Dataset fields + assert dataset["provenance"]["en"] == "Statement about provenance" + assert dataset["provenance"]["ca"] == "Una declaració sobre la procedència" + assert dataset["provenance"]["es"] == "Una declaración sobre la procedencia" + + assert 
dataset["version_notes"]["en"] == "Some version notes" + assert dataset["version_notes"]["ca"] == "Notes sobre la versió" + assert dataset["version_notes"]["es"] == "Notas sobre la versión" + + resource = dataset["resources"][0] + + # Resource core fields + assert resource["name"] == "Resource 1" + assert resource["name_translated"]["en"] == "Resource 1" + assert resource["name_translated"]["ca"] == "Recurs 1" + assert resource["name_translated"]["es"] == "Recurso 1" + + assert resource["description"] == "Some description" + assert resource["description_translated"]["en"] == "Some description" + assert resource["description_translated"]["ca"] == "Una descripció qualsevol" + assert resource["description_translated"]["es"] == "Una descripción cualquiera" + + # Resource fields + assert resource["rights"]["en"] == "Some stament about rights" + assert resource["rights"]["ca"] == "Una nota sobre drets" + assert resource["rights"]["es"] == "Una nota sobre derechos" diff --git a/ckanext/dcat/tests/profiles/dcat_ap_2/test_scheming_support.py b/ckanext/dcat/tests/profiles/dcat_ap_2/test_scheming_support.py index 383de651..20f87a2a 100644 --- a/ckanext/dcat/tests/profiles/dcat_ap_2/test_scheming_support.py +++ b/ckanext/dcat/tests/profiles/dcat_ap_2/test_scheming_support.py @@ -35,10 +35,6 @@ @pytest.mark.ckan_config( "scheming.dataset_schemas", "ckanext.dcat.schemas:dcat_ap_full.yaml" ) -@pytest.mark.ckan_config( - "scheming.presets", - "ckanext.scheming:presets.json ckanext.dcat.schemas:presets.yaml", -) @pytest.mark.ckan_config( "ckanext.dcat.rdf.profiles", "euro_dcat_ap_2 euro_dcat_ap_scheming" ) @@ -269,9 +265,6 @@ def test_e2e_ckan_to_dcat(self): wkt_geom = wkt.dumps(dataset["spatial_coverage"][0]["geom"], decimals=4) assert self._triple(g, spatial[0][2], LOCN.Geometry, wkt_geom, GSP.wktLiteral) - distribution_ref = self._triple(g, dataset_ref, DCAT.distribution, None)[2] - resource = dataset_dict["resources"][0] - # Statements for item in [ ("access_rights", DCT.accessRights), @@ -280,6 +273,9 @@ def test_e2e_ckan_to_dcat(self): statement = [s for s in g.objects(dataset_ref, item[1])][0] assert self._triple(g, statement, RDFS.label, dataset[item[0]]) + distribution_ref = self._triple(g, dataset_ref, DCAT.distribution, None)[2] + resource = dataset_dict["resources"][0] + # Resources: core fields assert self._triple(g, distribution_ref, DCT.title, resource["name"]) @@ -608,10 +604,6 @@ def test_dcat_date(self): @pytest.mark.ckan_config( "scheming.dataset_schemas", "ckanext.dcat.schemas:dcat_ap_full.yaml" ) -@pytest.mark.ckan_config( - "scheming.presets", - "ckanext.scheming:presets.json ckanext.dcat.schemas:presets.yaml", -) @pytest.mark.ckan_config( "ckanext.dcat.rdf.profiles", "euro_dcat_ap_2 euro_dcat_ap_scheming" ) @@ -640,10 +632,6 @@ def test_mimetype_is_guessed(self): @pytest.mark.ckan_config( "scheming.dataset_schemas", "ckanext.dcat.schemas:dcat_ap_full.yaml" ) -@pytest.mark.ckan_config( - "scheming.presets", - "ckanext.scheming:presets.json ckanext.dcat.schemas:presets.yaml", -) @pytest.mark.ckan_config( "ckanext.dcat.rdf.profiles", "euro_dcat_ap_2 euro_dcat_ap_scheming" ) @@ -975,10 +963,6 @@ def test_multiple_creators(self): @pytest.mark.ckan_config( "scheming.dataset_schemas", "ckanext.dcat.schemas:dcat_ap_full.yaml" ) -@pytest.mark.ckan_config( - "scheming.presets", - "ckanext.scheming:presets.json ckanext.dcat.schemas:presets.yaml", -) @pytest.mark.ckan_config( "ckanext.dcat.rdf.profiles", "euro_dcat_ap_2 euro_dcat_ap_scheming" ) diff --git 
a/ckanext/dcat/tests/profiles/dcat_ap_3/test_euro_dcatap_3_profile_parse.py b/ckanext/dcat/tests/profiles/dcat_ap_3/test_euro_dcatap_3_profile_parse.py index e887a24d..bfeb784a 100644 --- a/ckanext/dcat/tests/profiles/dcat_ap_3/test_euro_dcatap_3_profile_parse.py +++ b/ckanext/dcat/tests/profiles/dcat_ap_3/test_euro_dcatap_3_profile_parse.py @@ -11,10 +11,6 @@ @pytest.mark.ckan_config( "scheming.dataset_schemas", "ckanext.dcat.schemas:dcat_ap_full.yaml" ) -@pytest.mark.ckan_config( - "scheming.presets", - "ckanext.scheming:presets.json ckanext.dcat.schemas:presets.yaml", -) @pytest.mark.ckan_config("ckanext.dcat.rdf.profiles", "euro_dcat_ap_3") class TestSchemingParseSupport(BaseParseTest): def test_e2e_dcat_to_ckan(self): diff --git a/ckanext/dcat/tests/profiles/dcat_ap_3/test_euro_dcatap_3_profile_serialize.py b/ckanext/dcat/tests/profiles/dcat_ap_3/test_euro_dcatap_3_profile_serialize.py index c3a82d33..9d323f5c 100644 --- a/ckanext/dcat/tests/profiles/dcat_ap_3/test_euro_dcatap_3_profile_serialize.py +++ b/ckanext/dcat/tests/profiles/dcat_ap_3/test_euro_dcatap_3_profile_serialize.py @@ -36,10 +36,6 @@ class TestEuroDCATAP3ProfileSerializeDataset(BaseSerializeTest): @pytest.mark.ckan_config( "scheming.dataset_schemas", "ckanext.dcat.schemas:dcat_ap_full.yaml" ) - @pytest.mark.ckan_config( - "scheming.presets", - "ckanext.scheming:presets.json ckanext.dcat.schemas:presets.yaml", - ) @pytest.mark.ckan_config("ckanext.dcat.rdf.profiles", "euro_dcat_ap_3") def test_e2e_ckan_to_dcat(self): """ diff --git a/ckanext/dcat/tests/shacl/test_shacl.py b/ckanext/dcat/tests/shacl/test_shacl.py index 5455ae3e..132b4fe9 100644 --- a/ckanext/dcat/tests/shacl/test_shacl.py +++ b/ckanext/dcat/tests/shacl/test_shacl.py @@ -51,10 +51,6 @@ def _results_count(results_graph): @pytest.mark.ckan_config( "scheming.dataset_schemas", "ckanext.dcat.schemas:dcat_ap_full.yaml" ) -@pytest.mark.ckan_config( - "scheming.presets", - "ckanext.scheming:presets.json ckanext.dcat.schemas:presets.yaml", -) @pytest.mark.ckan_config( "ckanext.dcat.rdf.profiles", "euro_dcat_ap_2 euro_dcat_ap_scheming" ) @@ -75,10 +71,6 @@ def test_validate_dcat_ap_2_graph_shapes(): @pytest.mark.ckan_config( "scheming.dataset_schemas", "ckanext.dcat.schemas:dcat_ap_full.yaml" ) -@pytest.mark.ckan_config( - "scheming.presets", - "ckanext.scheming:presets.json ckanext.dcat.schemas:presets.yaml", -) @pytest.mark.ckan_config( "ckanext.dcat.rdf.profiles", "euro_dcat_ap_2 euro_dcat_ap_scheming" ) @@ -129,10 +121,6 @@ def test_validate_dcat_ap_2_legacy_graph_shapes_recommended(): @pytest.mark.ckan_config( "scheming.dataset_schemas", "ckanext.dcat.schemas:dcat_ap_full.yaml" ) -@pytest.mark.ckan_config( - "scheming.presets", - "ckanext.scheming:presets.json ckanext.dcat.schemas:presets.yaml", -) @pytest.mark.ckan_config( "ckanext.dcat.rdf.profiles", "euro_dcat_ap_2 euro_dcat_ap_scheming" ) @@ -169,10 +157,6 @@ def test_validate_dcat_ap_2_graph_shapes_range(): @pytest.mark.ckan_config( "scheming.dataset_schemas", "ckanext.dcat.schemas:dcat_ap_full.yaml" ) -@pytest.mark.ckan_config( - "scheming.presets", - "ckanext.scheming:presets.json ckanext.dcat.schemas:presets.yaml", -) @pytest.mark.ckan_config("ckanext.dcat.rdf.profiles", "euro_dcat_ap_3") def test_validate_dcat_ap_3_graph(): diff --git a/ckanext/dcat/tests/utils.py b/ckanext/dcat/tests/utils.py index 5a3d07cb..3da92248 100644 --- a/ckanext/dcat/tests/utils.py +++ b/ckanext/dcat/tests/utils.py @@ -29,20 +29,17 @@ def _extras(self, dataset): extras[extra["key"]] = extra["value"] return extras - def 
_triples(self, graph, subject, predicate, _object, data_type=None): + def _triples(self, graph, subject, predicate, _object, data_type=None, lang=None): if not ( isinstance(_object, URIRef) or isinstance(_object, BNode) or _object is None ): - if data_type: - _object = Literal(_object, datatype=data_type) - else: - _object = Literal(_object) + _object = Literal(_object, datatype=data_type, lang=lang) triples = [t for t in graph.triples((subject, predicate, _object))] return triples - def _triple(self, graph, subject, predicate, _object, data_type=None): - triples = self._triples(graph, subject, predicate, _object, data_type) + def _triple(self, graph, subject, predicate, _object, data_type=None, lang=None): + triples = self._triples(graph, subject, predicate, _object, data_type, lang) return triples[0] if triples else None def _triples_list_values(self, graph, subject, predicate): diff --git a/docs/getting-started.md b/docs/getting-started.md index 5310b4ad..86462e44 100644 --- a/docs/getting-started.md +++ b/docs/getting-started.md @@ -37,6 +37,8 @@ There are the following schemas currently included with the extension: * *dcat_ap_recommended.yaml*: Includes the recommended properties for `dcat:Dataset` and `dcat:Distribution` according to the DCAT AP specification. You can use this schema with the `euro_dcat_ap_2` (+ `euro_dcat_ap_scheming`) and `euro_dcat_ap_3` profiles. * *dcat_ap_full.yaml*: Includes most of the properties defined for `dcat:Dataset` and `dcat:Distribution` in the [DCAT AP v2.1](https://semiceu.github.io/DCAT-AP/releases/2.1.1/) and [DCAT AP v3](https://semiceu.github.io/DCAT-AP/releases/3.0.0/) specification. You can use this schema with the `euro_dcat_ap_2` (+ `euro_dcat_ap_scheming`) and `euro_dcat_ap_3` profiles. +* *dcat_ap_multilingual.yaml*: An example schema implementing multilingual metadata in some fields using [ckanext-fluent](https://github.com/ckan/ckanext-fluent). See [Multilingual support](profiles.md#multilingual-support) for more information. + Most sites will want to use these as a base to create their own custom schema to address their own requirements, perhaps alongside a [custom profile](profiles.md#profiles). Of course site maintainers can add or remove schema fields, as well as change the existing validators. diff --git a/docs/profiles.md b/docs/profiles.md index 37deae25..9813ff0f 100644 --- a/docs/profiles.md +++ b/docs/profiles.md @@ -115,7 +115,84 @@ Extensions define their available profiles using the `ckan.rdf.profiles` entrypo euro_dcat_ap_scheming=ckanext.dcat.profiles:EuropeanDCATAPSchemingProfile schemaorg=ckanext.dcat.profiles:SchemaOrgProfile -## RDF DCAT Parser +## Multilingual support + +Support for parsing and serializing multilingual properties is provided by integrating with +[ckanext-fluent](https://github.com/ckan/ckanext-fluent), which provides a way to store multilingual +data in CKAN entities like datasets and resources. + +Multilingual fields need to use one of the fluent [presets](https://github.com/ckan/ckanext-fluent#fluent_text-fields) (like `fluent_text`, `fluent_markdown` or `fluent_tags`) in their schema, e.g.: + +```yaml +- field_name: provenance + preset: fluent_markdown + label: + en: Provenance + ca: Procedència + es: Procedencia +``` + +This will make CKAN store the values for the different languages separately. 
The parsers will +import properties from DCAT serializations in this format if the field is defined as fluent in +the schema: + +```json +{ + "name": "test-dataset", + "provenance": { + "en": "Statement about provenance", + "ca": "Una declaració sobre la procedència", + "es": "Una declaración sobre la procedencia" + } +} +``` + +!!! Note + If one of the languages is missing in the DCAT serialization, an empty string will be + returned for that language. Also if the DCAT serialization does not define the language + used, the default CKAN language will be used ([`ckan.locale_default`](https://docs.ckan.org/en/latest/maintaining/configuration.html#ckan-locale-default)). + + +Conversely, when serializing the CKAN dataset, a new triple will be added for each of the +defined languages (if the translation is present): + +```turtle +@prefix dcat: . +@prefix dct: . +@prefix foaf: . +@prefix owl: . +@prefix rdfs: . +@prefix vcard: . +@prefix xsd: . + + a dcat:Dataset ; + dct:title "Conjunt de dades de prova DCAT"@ca, + "Test DCAT dataset"@en, + "Conjunto de datos de prueba DCAT"@es ; + dct:description "Una descripció qualsevol"@ca, + "Some description"@en, + "Una descripción cualquiera"@es ; + dct:language "ca", + "en", + "es" ; + dct:provenance [ a dct:ProvenanceStatement ; + rdfs:label "Una declaració sobre la procedència"@ca, + "Statement about provenance"@en, + "Una declaración sobre la procedencia"@es ] ; +``` + +See [*examples/ckan/ckan_dataset_multilingual.json*](https://github.com/ckan/ckanext-dcat/blob/master/examples/ckan/ckan_dataset_multilingual.json) and [*examples/dcat/dataset_multilingual.ttl*](https://github.com/ckan/ckanext-dcat/blob/master/examples/dcat/dataset_multilingual.ttl) +for examples of a multilingual CKAN dataset and DCAT serialization. + + +Users [writing custom profiles](#writing-custom-profiles) can make use of the `_object_value_multilingual()` +and `_object_value_list_multilingual()` functions of the profile class to handle custom fields not defined +in the base profiles. + + +## Internals + +### RDF DCAT Parser The `ckanext.dcat.processors.RDFParser` class allows to read RDF serializations in different formats and extract CKAN dataset dicts. It will look for DCAT datasets and distributions @@ -164,7 +241,7 @@ The parser is implemented using [RDFLib](https://rdflib.readthedocs.org/), a Pyt RDF serialization format supported by RDFLib can be parsed into CKAN datasets. The `examples` folder contains serializations in different formats including RDF/XML, Turtle or JSON-LD. -## RDF DCAT Serializer +### RDF DCAT Serializer The `ckanext.dcat.processors.RDFSerializer` class generates RDF serializations in different formats from CKAN dataset dicts, like the ones returned by [`package_show`](http://docs.ckan.org/en/latest/api/index.html#ckan.logic.action.get.package_show) or [`package_search`](http://docs.ckan.org/en/latest/api/index.html#ckan.logic.action.get.package_search). @@ -233,10 +310,6 @@ the following values will be used for `dct:accrualPeriodicity`: Once the dataset graph has been obtained, this is serialized into a text format using [RDFLib](https://rdflib.readthedocs.org/), so any format it supports can be obtained (common formats are 'xml', 'turtle' or 'json-ld'). -### Inherit license from the dataset as fallback in distributions -It is possible to inherit the license from the dataset to the distributions, but only if there is no license defined in the resource yet. By default the license is not inherited from the dataset. 
This can be activated by setting the following parameter in the CKAN config file: - - ckanext.dcat.resource.inherit.license = True diff --git a/examples/ckan/ckan_dataset_multilingual.json b/examples/ckan/ckan_dataset_multilingual.json new file mode 100644 index 00000000..a173524d --- /dev/null +++ b/examples/ckan/ckan_dataset_multilingual.json @@ -0,0 +1,94 @@ +{ + "name": "test-dataset-multilingual", + "title": "Test DCAT dataset", + "title_translated": { + "en": "Test DCAT dataset", + "ca": "Conjunt de dades de prova DCAT", + "es": "Conjunto de datos de prueba DCAT" + }, + "notes": "Some description", + "notes_translated": { + "en": "Some description", + "ca": "Una descripció qualsevol", + "es": "Una descripción cualquiera" + }, + "url": "http://example.org/ds1", + "tags": [ + { + "name": "Oaks" + }, + { + "name": "Pines" + } + ], + "tags_translated": { + "en": ["Oaks", "Pines"], + "ca": ["Roures", "Pins"], + "es": ["Robles", "Pinos"] + }, + "issued": "2024-05-01", + "modified": "2024-05-05", + "provenance": { + "en": "Statement about provenance", + "ca": "Una declaració sobre la procedència", + "es": "Una declaración sobre la procedencia" + }, + "version": "1.0b", + "version_notes":{ + "en": "Some version notes", + "ca": "Notes sobre la versió", + "es": "Notas sobre la versión" + }, + "language": [ + "en", + "ca", + "es" + ], + "documentation": [ + "https://example.org/some-doc.html" + ], + "contact": [ + { + "name": "Contact 1", + "email": "contact1@example.org", + "identifier": "123" + }, + { + "name": "Contact 2", + "email": "contact2@example.org", + "identifier": "456" + } + ], + "resources": [ + { + "name": "Resource 1", + "name_translated": { + "en": "Resource 1", + "ca": "Recurs 1", + "es": "Recurso 1" + }, + "description": "Some description", + "description_translated": { + "en": "Some description", + "ca": "Una descripció qualsevol", + "es": "Una descripción cualquiera" + }, + "url": "https://example.com/data.csv", + "format": "CSV", + "access_url": "https://example.com/data.csv", + "download_url": "https://example.com/data.csv", + "issued": "2024-05-01T01:20:33", + "modified": "2024-05-05T09:33:20", + "rights": { + "en": "Some stament about rights", + "ca": "Una nota sobre drets", + "es": "Una nota sobre derechos" + }, + "language": [ + "en", + "ca", + "es" + ] + } + ] +} diff --git a/examples/dcat/dataset_multilingual.ttl b/examples/dcat/dataset_multilingual.ttl new file mode 100644 index 00000000..6dd24bca --- /dev/null +++ b/examples/dcat/dataset_multilingual.ttl @@ -0,0 +1,74 @@ +@prefix adms: . +@prefix dcat: . +@prefix dct: . +@prefix foaf: . +@prefix owl: . +@prefix rdfs: . +@prefix vcard: . +@prefix xsd: . 
+ + a dcat:Dataset ; + dct:description "Una descripció qualsevol"@ca, + "Some description"@en, + "Una descripción cualquiera"@es ; + dct:identifier "0112cf32-bce0-4071-9504-923375f9f2ad" ; + dct:issued "2024-05-01"^^xsd:date ; + dct:language "ca", + "en", + "es" ; + dct:modified "2024-05-05"^^xsd:date ; + dct:provenance [ a dct:ProvenanceStatement ; + rdfs:label "Una declaració sobre la procedència"@ca, + "Statement about provenance"@en, + "Una declaración sobre la procedencia"@es ] ; + dct:title "Conjunt de dades de prova DCAT"@ca, + "Test DCAT dataset"@en, + "Conjunto de datos de prueba DCAT"@es ; + owl:versionInfo "1.0b" ; + adms:versionNotes "Notes sobre la versió"@ca, + "Some version notes"@en, + "Notas sobre la versión"@es ; + dcat:contactPoint [ a vcard:Kind ; + vcard:fn "Contact 2" ; + vcard:hasEmail ; + vcard:hasUID "456" ], + [ a vcard:Kind ; + vcard:fn "Contact 1" ; + vcard:hasEmail ; + vcard:hasUID "123" ] ; + dcat:distribution ; + dcat:keyword "Pins"@ca, + "Roures"@ca, + "Oaks"@en, + "Pines"@en, + "Pinos"@es, + "Robles"@es ; + dcat:landingPage ; + foaf:page . + + a foaf:Document . + + a dcat:Distribution ; + dct:description "Una descripció qualsevol"@ca, + "Some description"@en, + "Una descripción cualquiera"@es ; + dct:format "CSV" ; + dct:issued "2024-05-01T01:20:33"^^xsd:dateTime ; + dct:language "ca", + "en", + "es" ; + dct:modified "2024-05-05T09:33:20"^^xsd:dateTime ; + dct:rights [ a dct:RightsStatement ; + rdfs:label "Una nota sobre drets"@ca, + "Some stament about rights"@en, + "Una nota sobre derechos"@es ] ; + dct:title "Recurs 1"@ca, + "Resource 1"@en, + "Recurso 1"@es ; + dcat:accessURL ; + dcat:downloadURL . + + a foaf:Document . + + a rdfs:Resource . + diff --git a/test.ini b/test.ini index 529eebc4..afab9137 100644 --- a/test.ini +++ b/test.ini @@ -17,6 +17,10 @@ ckanext.dcat.enable_content_negotiation=True ckan.activity_streams_enabled = false ckan.harvest.mq.type = redis +# Needed here because of https://github.com/ckan/ckanext-scheming/issues/424 +scheming.presets = ckanext.scheming:presets.json ckanext.dcat.schemas:presets.yaml ckanext.fluent:presets.json + + # Logging configuration [loggers] keys = root, ckan, sqlalchemy
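
As a supplementary illustration (not part of the changeset above): a minimal sketch of how a custom profile could build on the `_object_value_multilingual()` / `_object_value_list_multilingual()` helpers and language-tagged literals introduced in this diff, as suggested in the profiles.md section. The profile class name and the `version_notes` / `adms:versionNotes` mapping are assumptions chosen for the example; `parse_dataset()` and `graph_from_dataset()` are assumed to be the standard ckanext-dcat profile hooks.

```python
# Illustrative sketch only, not part of this diff: a custom profile that uses
# the new multilingual helpers added to RDFProfile in ckanext/dcat/profiles/base.py.
from rdflib import Literal

from ckanext.dcat.profiles import ADMS, DCAT
from ckanext.dcat.profiles.base import RDFProfile


class ExampleMultilingualProfile(RDFProfile):

    def parse_dataset(self, dataset_dict, dataset_ref):
        # Returns e.g. {"en": "Some version notes", "ca": "Notes sobre la versió"}.
        # Languages listed in the schema's `form_languages` but missing from the
        # graph come back as empty strings.
        version_notes = self._object_value_multilingual(dataset_ref, ADMS.versionNotes)
        if any(version_notes.values()):
            dataset_dict["version_notes"] = version_notes

        # Keywords grouped by language, e.g. {"en": ["Oaks"], "ca": ["Roures"]}
        dataset_dict["tags_translated"] = self._object_value_list_multilingual(
            dataset_ref, DCAT.keyword
        )
        return dataset_dict

    def graph_from_dataset(self, dataset_dict, dataset_ref):
        # When the stored value is a fluent dict, add one language-tagged
        # literal per available translation.
        for lang, value in dataset_dict.get("version_notes", {}).items():
            if value:
                self.g.add((dataset_ref, ADMS.versionNotes, Literal(value, lang=lang)))
```

Such a profile would be registered through the same `ckan.rdf.profiles` entry point mechanism described in docs/profiles.md.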