diff --git a/README.md b/README.md
index 1b6ce2b8..c716de04 100644
--- a/README.md
+++ b/README.md
@@ -280,10 +280,13 @@ When using a plain string translations will be provided with gettext:
 This field is the parent of group of repeating subfields. The value is
 a list of fields entered the same way as normal fields.
 
-> **_NOTE:_** CKAN needs an IPackageController plugin with `before_index` to
-> convert repeating subfields to formats that can be indexed by solr. For
-> testing you may use the included `scheming_nerf_index` plugin to encode
-> all repeating fields as JSON strings to prevent solr errors.
+> [!NOTE]
+> CKAN needs an IPackageController plugin with `before_dataset_index` to
+> convert repeating subfields to formats that can be indexed by solr. The
+> included `scheming_subfields_index` plugin will group the values of the
+> same subfields in a text field that will make the values findable. If
+> you require more precise handling of a particular subfield,
+> you will need to customize the Solr schema to add the necessary fields.
 
 `repeating_label` may be used to provide a singular version of the label
 for each group.
diff --git a/ckanext/scheming/plugins.py b/ckanext/scheming/plugins.py
index 0980a684..27344613 100644
--- a/ckanext/scheming/plugins.py
+++ b/ckanext/scheming/plugins.py
@@ -499,6 +499,73 @@ def before_index(self, data_dict):
         return data_dict
 
 
+class SchemingSubfieldsIndexPlugin(p.SingletonPlugin):
+    """
+    Index suitable repeating dataset fields in before_dataset_index to prevent failures
+    on unmodified solr schema. This will allow hitting results in most text and list
+    subfields. Ideally you probably want to select the relevant subfields that will get
+    indexed and modify the Solr schema if necessary.
+
+    This implementation will group the values of the same subfields into an
+    `extras_{field_name}__{key}`, a text Solr field that will allow free-text search on
+    its value. Again, if you require more precise handling of a particular subfield,
+    you will need to customize the Solr schema to add particular fields needed.
+
+    Subfield values that are dicts, None or empty are not indexed. Other values
+    are converted to text (list items are joined with spaces) before grouping.
+    """
+    p.implements(p.IPackageController, inherit=True)
+
+    def before_dataset_index(self, data_dict):
+        # Same hook under its newer name, both names share one implementation
+        return self.before_index(data_dict)
+
+    def before_index(self, data_dict):
+        """
+        Replace the repeating subfields of data_dict with flat
+        `extras_{field_name}__{key}` text values and return data_dict.
+        """
+        schemas = SchemingDatasetsPlugin.instance._expanded_schemas
+        dataset_type = data_dict.get('type')
+        if dataset_type not in schemas:
+            return data_dict
+
+        for field in schemas[dataset_type]['dataset_fields']:
+            if 'repeating_subfields' not in field:
+                continue
+            field_name = field['field_name']
+            items = data_dict.get(field_name)
+            if not isinstance(items, list):
+                # Missing, or not a list of subfield groups: leave untouched
+                continue
+
+            # Drop the nested list, an unmodified Solr schema fails on it
+            del data_dict[field_name]
+            for item in items:
+                if not isinstance(item, dict):
+                    continue
+                for key, value in item.items():
+                    if value is None or isinstance(value, dict):
+                        continue
+                    if isinstance(value, list):
+                        value = u' '.join(u'{}'.format(v) for v in value)
+                    else:
+                        # Numbers or booleans can not be concatenated to text
+                        value = u'{}'.format(value)
+                    if not value:
+                        continue
+                    # Index a flattened version
+                    new_key = 'extras_{field_name}__{key}'.format(
+                        field_name=field_name, key=key
+                    )
+                    if data_dict.get(new_key):
+                        data_dict[new_key] += u' ' + value
+                    else:
+                        data_dict[new_key] = value
+
+        return data_dict
+
+
 def _load_schemas(schemas, type_field):
     out = {}
     for n in schemas:
diff --git a/ckanext/scheming/tests/test_subfields.py b/ckanext/scheming/tests/test_subfields.py
new file mode 100644
index 00000000..d23d27e0
--- /dev/null
+++ b/ckanext/scheming/tests/test_subfields.py
@@ -0,0 +1,45 @@
+try:
+    from unittest import mock
+except ImportError:
+    import mock
+
+import pytest
+import ckantoolkit
+
+from ckantoolkit.tests.factories import Dataset
+from ckantoolkit.tests.helpers import call_action
+
+
+dataset_dict = {
+    "name": "test-dataset",
+    "type": "test-subfields",
+    # Repeating subfields
+    "contact_address": [
+        {"address": "Maple Street 123", "city": "New Paris", "country": "Maplonia"},
+        {"address": "Rose Avenue 452", "city": "Old York", "country": "Rosestan"},
+    ],
+}
+
+
+@pytest.mark.usefixtures("with_plugins", "clean_db")
+@pytest.mark.ckan_config("ckan.plugins", "scheming_datasets scheming_subfields_index")
+def test_repeating_subfields_index():
+
+    with mock.patch("ckan.lib.search.index.make_connection") as m:
+        call_action("package_create", **dataset_dict)
+
+    # Dict sent to Solr
+    search_dict = m.mock_calls[1].kwargs["docs"][0]
+    assert search_dict["extras_contact_address__city"] == "New Paris Old York"
+    assert search_dict["extras_contact_address__country"] == "Maplonia Rosestan"
+
+
+@pytest.mark.usefixtures("with_plugins", "clean_db")
+@pytest.mark.ckan_config("ckan.plugins", "scheming_datasets scheming_subfields_index")
+def test_repeating_subfields_search():
+
+    dataset = call_action("package_create", **dataset_dict)
+
+    result = call_action("package_search", q="Old York")
+
+    assert result["results"][0]["id"] == dataset["id"]
diff --git a/ckanext/scheming/tests/test_validation.py b/ckanext/scheming/tests/test_validation.py
index 590c43d3..fcb5a9da 100644
--- a/ckanext/scheming/tests/test_validation.py
+++ b/ckanext/scheming/tests/test_validation.py
@@ -21,6 +21,22 @@
 not_empty = get_validator("not_empty")
 
 
+pytestmark = [
+    pytest.mark.usefixtures("with_plugins"),
+    pytest.mark.ckan_config(
+        "ckan.plugins",
+        " ".join([
+            "scheming_datasets",
+            "scheming_groups",
+            "scheming_organizations",
+            "scheming_test_plugin",
+            "scheming_subfields_index",
+            "scheming_test_validation",
+        ])
+    )
+]
+
+
 class TestGetValidatorOrConverter(object):
     def test_missing(self):
         with pytest.raises(SchemingException):
@@ -941,8 +957,6 @@ def test_invalid_choice(self):
             raise AssertionError("ValidationError not raised")
 
 
-@pytest.mark.ckan_config("ckan.plugins", "scheming_test_validation")
-@pytest.mark.usefixtures("with_plugins")
 class TestValidatorsFromString:
     def test_empty(self):
         assert validators_from_string("", {}, {}) == []
diff --git a/setup.py b/setup.py
index 80b58012..daed8972 100644
--- a/setup.py
+++ b/setup.py
@@ -38,6 +38,7 @@
         scheming_groups=ckanext.scheming.plugins:SchemingGroupsPlugin
         scheming_organizations=ckanext.scheming.plugins:SchemingOrganizationsPlugin
         scheming_nerf_index=ckanext.scheming.plugins:SchemingNerfIndexPlugin
+        scheming_subfields_index=ckanext.scheming.plugins:SchemingSubfieldsIndexPlugin
         scheming_test_subclass=ckanext.scheming.tests.plugins:SchemingTestSubclass
         scheming_test_plugin=ckanext.scheming.tests.plugins:SchemingTestSchemaPlugin
         scheming_test_validation=ckanext.scheming.tests.plugins:SchemingTestValidationPlugin
diff --git a/test.ini b/test.ini
index 1dc6a6d3..23198180 100644
--- a/test.ini
+++ b/test.ini
@@ -12,7 +12,7 @@ port = 5000
 use = config:../../src/ckan/test-core.ini
 
 ckan.plugins = scheming_datasets scheming_groups scheming_organizations
-               scheming_test_plugin scheming_nerf_index
+               scheming_test_plugin scheming_subfields_index
 scheming.dataset_schemas = ckanext.scheming:ckan_dataset.yaml
                            ckanext.scheming.tests:test_schema.json
                            ckanext.scheming.tests:test_subfields.yaml