From 00771bb98f2236686f827917438a9d41419f19fa Mon Sep 17 00:00:00 2001 From: nospame Date: Tue, 11 Feb 2025 16:41:54 -0800 Subject: [PATCH 1/7] Get export transform functions with importlib via util --- corehq/apps/export/const.py | 36 ++++++++++---------------------- corehq/apps/export/models/new.py | 12 ++++++++--- corehq/apps/export/utils.py | 13 +++++++++++- corehq/apps/hqcase/utils.py | 6 ++++-- 4 files changed, 36 insertions(+), 31 deletions(-) diff --git a/corehq/apps/export/const.py b/corehq/apps/export/const.py index 6becd655542f..0feb81f86366 100644 --- a/corehq/apps/export/const.py +++ b/corehq/apps/export/const.py @@ -2,20 +2,6 @@ Some of these constants correspond to constants set in corehq/apps/export/static/export/js/const.js so if changing a value, ensure that both places reflect the change """ -from couchexport.deid import deid_date, deid_ID - -from corehq.apps.export.transforms import ( - case_close_to_boolean, - case_id_to_case_name, - case_id_to_link, - case_or_user_id_to_name, - doc_type_transform, - form_id_to_link, - owner_id_to_display, - user_id_to_username, - workflow_transform, -) - # When fixing a bug that requires existing schemas to be rebuilt, # bump the version number. FORM_DATA_SCHEMA_VERSION = 10 @@ -25,8 +11,8 @@ DEID_ID_TRANSFORM = "deid_id" DEID_DATE_TRANSFORM = "deid_date" DEID_TRANSFORM_FUNCTIONS = { - DEID_ID_TRANSFORM: deid_ID, - DEID_DATE_TRANSFORM: deid_date, + DEID_DATE_TRANSFORM: 'deid_date', + DEID_ID_TRANSFORM: 'deid_ID', } CASE_NAME_TRANSFORM = "case_name_transform" CASE_ID_TO_LINK = "case_link_transform" @@ -38,15 +24,15 @@ CASE_OR_USER_ID_TRANSFORM = "case_or_user_id_transform" CASE_CLOSE_TO_BOOLEAN = "case_close_to_boolean" TRANSFORM_FUNCTIONS = { - CASE_NAME_TRANSFORM: case_id_to_case_name, - CASE_ID_TO_LINK: case_id_to_link, - FORM_ID_TO_LINK: form_id_to_link, - USERNAME_TRANSFORM: user_id_to_username, - OWNER_ID_TRANSFORM: owner_id_to_display, - WORKFLOW_TRANSFORM: workflow_transform, - DOC_TYPE_TRANSFORM: doc_type_transform, - CASE_OR_USER_ID_TRANSFORM: case_or_user_id_to_name, - CASE_CLOSE_TO_BOOLEAN: case_close_to_boolean, + CASE_NAME_TRANSFORM: 'case_id_to_case_name', + CASE_ID_TO_LINK: 'case_id_to_link', + FORM_ID_TO_LINK: 'form_id_to_link', + USERNAME_TRANSFORM: 'user_id_to_username', + OWNER_ID_TRANSFORM: 'owner_id_to_display', + WORKFLOW_TRANSFORM: 'workflow_transform', + DOC_TYPE_TRANSFORM: 'doc_type_transform', + CASE_OR_USER_ID_TRANSFORM: 'case_or_user_id_to_name', + CASE_CLOSE_TO_BOOLEAN: 'case_close_to_boolean', } PLAIN_USER_DEFINED_SPLIT_TYPE = 'plain' MULTISELCT_USER_DEFINED_SPLIT_TYPE = 'multi-select' diff --git a/corehq/apps/export/models/new.py b/corehq/apps/export/models/new.py index dd94f03cd70c..ecff324f7c06 100644 --- a/corehq/apps/export/models/new.py +++ b/corehq/apps/export/models/new.py @@ -99,7 +99,11 @@ get_form_export_base_query, get_sms_export_base_query, ) -from corehq.apps.export.utils import is_occurrence_deleted +from corehq.apps.export.utils import ( + get_deid_transform_function, + get_transform_function, + is_occurrence_deleted, +) from corehq.apps.locations.models import SQLLocation from corehq.apps.products.models import SQLProduct from corehq.apps.reports.daterange import get_daterange_start_end_dates @@ -322,10 +326,12 @@ def _transform(self, value, doc, transform_dates): if transform_dates: value = couch_to_excel_datetime(value, doc) if self.item.transform: - value = TRANSFORM_FUNCTIONS[self.item.transform](value, doc) + transform_function = get_transform_function(TRANSFORM_FUNCTIONS[self.item.transform]) + value = transform_function(value, doc) if self.deid_transform: try: - value = DEID_TRANSFORM_FUNCTIONS[self.deid_transform](value, doc) + transform_function = get_deid_transform_function(DEID_TRANSFORM_FUNCTIONS[self.deid_transform]) + value = transform_function(value, doc) except ValueError: # Unable to convert the string to a date pass diff --git a/corehq/apps/export/utils.py b/corehq/apps/export/utils.py index 5e638f4944d2..b51f2fdd7a07 100644 --- a/corehq/apps/export/utils.py +++ b/corehq/apps/export/utils.py @@ -1,8 +1,8 @@ +import importlib from django.http import Http404 from couchdbkit import ResourceNotFound -from corehq.apps.accounting.models import Subscription from corehq.apps.accounting.utils import domain_has_privilege from corehq.privileges import DAILY_SAVED_EXPORT, DEFAULT_EXPORT_SETTINGS, EXCEL_DASHBOARD from corehq.toggles import MESSAGE_LOG_METADATA @@ -57,6 +57,7 @@ def get_default_export_settings_if_available(domain): """ Only creates settings if the domain has the DEFAULT_EXPORT_SETTINGS privilege """ + from corehq.apps.accounting.models import Subscription settings = None current_subscription = Subscription.get_active_subscription_by_domain(domain) if current_subscription and domain_has_privilege(domain, DEFAULT_EXPORT_SETTINGS): @@ -64,3 +65,13 @@ def get_default_export_settings_if_available(domain): settings = DefaultExportSettings.objects.get_or_create(account=current_subscription.account)[0] return settings + + +def get_transform_function(func_name): + module = importlib.import_module('corehq.apps.export.transforms') + return getattr(module, func_name) + + +def get_deid_transform_function(func_name): + module = importlib.import_module('couchexport.deid') + return getattr(module, func_name) diff --git a/corehq/apps/hqcase/utils.py b/corehq/apps/hqcase/utils.py index 9ad8ac3208de..e0579560002a 100644 --- a/corehq/apps/hqcase/utils.py +++ b/corehq/apps/hqcase/utils.py @@ -7,7 +7,6 @@ from casexml.apps.case.mock import CaseBlock from casexml.apps.case.util import property_changed_in_action -from couchexport.deid import deid_date, deid_ID from dimagi.utils.parsing import json_format_datetime from corehq.apps.case_search.const import INDEXED_METADATA_BY_KEY @@ -15,6 +14,7 @@ from corehq.apps.es import filters from corehq.apps.es.cases import CaseES from corehq.apps.export.const import DEID_DATE_TRANSFORM, DEID_ID_TRANSFORM +from corehq.apps.export.utils import get_deid_transform_function from corehq.apps.receiverwrapper.util import submit_form_locally from corehq.apps.users.util import SYSTEM_USER_ID from corehq.form_processor.exceptions import CaseNotFound, MissingFormXml @@ -293,9 +293,11 @@ def get_deidentified_data(case, censor_data): censored_value = '' if transform == DEID_DATE_TRANSFORM: + deid_date = get_deid_transform_function(DEID_DATE_TRANSFORM) censored_value = deid_date(case_value, None, key=case.case_id) if transform == DEID_ID_TRANSFORM: - censored_value = deid_ID(case_value, None) + deid_id = get_deid_transform_function(DEID_ID_TRANSFORM) + censored_value = deid_id(case_value, None) if is_case_property: props[attr_or_prop] = censored_value From fa24fd6477b2b60d4d6e9d099b5348ac80c06b99 Mon Sep 17 00:00:00 2001 From: nospame Date: Wed, 12 Feb 2025 14:50:55 -0800 Subject: [PATCH 2/7] Add DeIdMapping --- .../export/migrations/0014_deidmapping.py | 39 +++++++++++++++++ corehq/apps/export/models/__init__.py | 4 ++ corehq/apps/export/models/deid_export.py | 42 +++++++++++++++++++ migrations.lock | 1 + 4 files changed, 86 insertions(+) create mode 100644 corehq/apps/export/migrations/0014_deidmapping.py create mode 100644 corehq/apps/export/models/deid_export.py diff --git a/corehq/apps/export/migrations/0014_deidmapping.py b/corehq/apps/export/migrations/0014_deidmapping.py new file mode 100644 index 000000000000..635b2d4a059f --- /dev/null +++ b/corehq/apps/export/migrations/0014_deidmapping.py @@ -0,0 +1,39 @@ +# Generated by Django 4.2.17 on 2025-02-12 00:46 + +from django.db import migrations, models +import uuid + + +class Migration(migrations.Migration): + + dependencies = [ + ("export", "0013_rm_incrementalexport"), + ] + + operations = [ + migrations.CreateModel( + name="DeIdMapping", + fields=[ + ( + "id", + models.AutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("domain", models.TextField(max_length=255)), + ("hashed_value", models.TextField(max_length=32)), + ("deid", models.UUIDField(default=uuid.uuid4)), + ], + options={ + "indexes": [ + models.Index( + fields=["domain", "hashed_value"], + name="export_deid_domain_3c63a4_idx", + ) + ], + }, + ), + ] diff --git a/corehq/apps/export/models/__init__.py b/corehq/apps/export/models/__init__.py index bef7b31fa23b..7b3a351702e9 100644 --- a/corehq/apps/export/models/__init__.py +++ b/corehq/apps/export/models/__init__.py @@ -45,6 +45,10 @@ DataSourceExportInstance, ) +from .deid_export import ( + DeIdMapping, +) + from .export_settings import ( DefaultExportSettings, ) diff --git a/corehq/apps/export/models/deid_export.py b/corehq/apps/export/models/deid_export.py new file mode 100644 index 000000000000..3201fefbb8e8 --- /dev/null +++ b/corehq/apps/export/models/deid_export.py @@ -0,0 +1,42 @@ +import hashlib +import uuid + +from django.db import models + +from corehq.const import ONE_DAY +from corehq.util.quickcache import quickcache + + +class DeIdMapping(models.Model): + domain = models.TextField(max_length=255) + hashed_value = models.TextField(max_length=32) + deid = models.UUIDField(default=uuid.uuid4) + + class Meta: + indexes = [ + models.Index(fields=['domain', 'hashed_value']), + ] + + @classmethod + def get_deid(cls, value, doc, domain=None): + if doc is not None: + # use domain from the couch doc if one was passed in + domain = doc['domain'] + + return cls._get_deid(value, domain) + + @classmethod + @quickcache(['value', 'domain'], timeout=90 * ONE_DAY) + def _get_deid(cls, value, domain): + hashed_value = cls._hash_value(value) + deid_mapping, __ = cls.objects.get_or_create(domain=domain, hashed_value=hashed_value) + return deid_mapping.deid + + @staticmethod + @quickcache(['value'], timeout=90 * ONE_DAY) + def _hash_value(value): + if value is None: + # None is a de-identifiable value but needs a string to encode for lookup + value = '' + + return hashlib.md5(value.encode('utf-8')).hexdigest() diff --git a/migrations.lock b/migrations.lock index a3bf57ba6f42..7cec7c58dc2a 100644 --- a/migrations.lock +++ b/migrations.lock @@ -465,6 +465,7 @@ export 0011_defaultexportsettings_usecouchfiletypes 0012_defaultexportsettings_remove_duplicates_option 0013_rm_incrementalexport + 0014_deidmapping fhir 0001_initial 0002_fhirresourcetype From 2f2b1624dd8365f33797eef756add0a18799f447 Mon Sep 17 00:00:00 2001 From: nospame Date: Wed, 12 Feb 2025 15:50:46 -0800 Subject: [PATCH 3/7] Point get_deid_transform_function to DeIdMapping --- corehq/apps/export/utils.py | 11 +++++++++-- corehq/apps/hqcase/utils.py | 12 ++++++++---- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/corehq/apps/export/utils.py b/corehq/apps/export/utils.py index b51f2fdd7a07..e77a9c54c5c8 100644 --- a/corehq/apps/export/utils.py +++ b/corehq/apps/export/utils.py @@ -4,6 +4,9 @@ from couchdbkit import ResourceNotFound from corehq.apps.accounting.utils import domain_has_privilege +from corehq.apps.export.const import ( + DEID_DATE_TRANSFORM, DEID_ID_TRANSFORM, DEID_TRANSFORM_FUNCTIONS +) from corehq.privileges import DAILY_SAVED_EXPORT, DEFAULT_EXPORT_SETTINGS, EXCEL_DASHBOARD from corehq.toggles import MESSAGE_LOG_METADATA @@ -73,5 +76,9 @@ def get_transform_function(func_name): def get_deid_transform_function(func_name): - module = importlib.import_module('couchexport.deid') - return getattr(module, func_name) + if func_name == DEID_TRANSFORM_FUNCTIONS[DEID_DATE_TRANSFORM]: + module = importlib.import_module('couchexport.deid') + return getattr(module, func_name) + elif func_name == DEID_TRANSFORM_FUNCTIONS[DEID_ID_TRANSFORM]: + from corehq.apps.export.models import DeIdMapping + return DeIdMapping.get_deid diff --git a/corehq/apps/hqcase/utils.py b/corehq/apps/hqcase/utils.py index e0579560002a..b2aee8bce920 100644 --- a/corehq/apps/hqcase/utils.py +++ b/corehq/apps/hqcase/utils.py @@ -13,7 +13,11 @@ from corehq.apps.data_interfaces.deduplication import DEDUPE_XMLNS from corehq.apps.es import filters from corehq.apps.es.cases import CaseES -from corehq.apps.export.const import DEID_DATE_TRANSFORM, DEID_ID_TRANSFORM +from corehq.apps.export.const import ( + DEID_DATE_TRANSFORM, + DEID_ID_TRANSFORM, + DEID_TRANSFORM_FUNCTIONS, +) from corehq.apps.export.utils import get_deid_transform_function from corehq.apps.receiverwrapper.util import submit_form_locally from corehq.apps.users.util import SYSTEM_USER_ID @@ -293,11 +297,11 @@ def get_deidentified_data(case, censor_data): censored_value = '' if transform == DEID_DATE_TRANSFORM: - deid_date = get_deid_transform_function(DEID_DATE_TRANSFORM) + deid_date = get_deid_transform_function(DEID_TRANSFORM_FUNCTIONS[DEID_DATE_TRANSFORM]) censored_value = deid_date(case_value, None, key=case.case_id) if transform == DEID_ID_TRANSFORM: - deid_id = get_deid_transform_function(DEID_ID_TRANSFORM) - censored_value = deid_id(case_value, None) + deid_id = get_deid_transform_function(DEID_TRANSFORM_FUNCTIONS[DEID_ID_TRANSFORM]) + censored_value = deid_id(case_value, None, domain=case.domain) if is_case_property: props[attr_or_prop] = censored_value From 5a9f74022f79ca7fcdf434ca02dc309eaad4bb38 Mon Sep 17 00:00:00 2001 From: nospame Date: Wed, 12 Feb 2025 15:52:36 -0800 Subject: [PATCH 4/7] Add TestDeIdMapping --- corehq/apps/export/tests/test_deid_export.py | 33 ++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 corehq/apps/export/tests/test_deid_export.py diff --git a/corehq/apps/export/tests/test_deid_export.py b/corehq/apps/export/tests/test_deid_export.py new file mode 100644 index 000000000000..c89f5a9f67ee --- /dev/null +++ b/corehq/apps/export/tests/test_deid_export.py @@ -0,0 +1,33 @@ +from django.test import TestCase + +from corehq.apps.export.models import DeIdMapping + + +class TestDeIdMapping(TestCase): + def test_deid_unique_by_domain(self): + value = 'somedatavalue' + + deid_one = DeIdMapping.get_deid(value, None, domain='test-domain-1') + deid_two = DeIdMapping.get_deid(value, None, domain='test-domain-2') + self.assertNotEqual(deid_one, deid_two) + + def test_deid_consistent_for_value_and_domain(self): + value = 'somedatavalue' + domain = 'test-domain' + + deid_one = DeIdMapping.get_deid(value, None, domain=domain) + deid_two = DeIdMapping.get_deid(value, None, domain=domain) + self.assertEqual(deid_one, deid_two) + + def test_none_is_a_deidentifiable_value(self): + value = None + + deid = DeIdMapping.get_deid(value, None, domain='test-domain') + self.assertIsNotNone(deid) + + def test_uses_domain_from_doc(self): + doc = {'domain': 'doc-test-domain'} + + deid = DeIdMapping.get_deid('somedatavalue', doc) + deid_mapping = DeIdMapping.objects.get(deid=deid) + self.assertEqual(deid_mapping.domain, doc['domain']) From 5e72079cb605cc9bb9a6c3bada984621d3358715 Mon Sep 17 00:00:00 2001 From: nospame Date: Thu, 13 Feb 2025 12:02:20 -0800 Subject: [PATCH 5/7] Return a string from get_deid Ensures it can be transformed to xml --- corehq/apps/export/models/deid_export.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/corehq/apps/export/models/deid_export.py b/corehq/apps/export/models/deid_export.py index 3201fefbb8e8..8380163fbec6 100644 --- a/corehq/apps/export/models/deid_export.py +++ b/corehq/apps/export/models/deid_export.py @@ -30,7 +30,7 @@ def get_deid(cls, value, doc, domain=None): def _get_deid(cls, value, domain): hashed_value = cls._hash_value(value) deid_mapping, __ = cls.objects.get_or_create(domain=domain, hashed_value=hashed_value) - return deid_mapping.deid + return str(deid_mapping.deid) @staticmethod @quickcache(['value'], timeout=90 * ONE_DAY) From 0d322dbb986b2c297bc66646463556f01a1e6633 Mon Sep 17 00:00:00 2001 From: nospame Date: Thu, 13 Feb 2025 13:48:34 -0800 Subject: [PATCH 6/7] Add DeIdMapping to domain dump and deletion models --- corehq/apps/domain/deletion.py | 1 + corehq/apps/dump_reload/sql/dump.py | 1 + 2 files changed, 2 insertions(+) diff --git a/corehq/apps/domain/deletion.py b/corehq/apps/domain/deletion.py index 9294e24fda66..f26c3280a74d 100644 --- a/corehq/apps/domain/deletion.py +++ b/corehq/apps/domain/deletion.py @@ -420,6 +420,7 @@ def _delete_demo_user_restores(domain_name): PartitionedModelDeletion('scheduling_partitioned', 'CaseTimedScheduleInstance', 'domain'), PartitionedModelDeletion('scheduling_partitioned', 'TimedScheduleInstance', 'domain'), ModelDeletion('domain', 'TransferDomainRequest', 'domain'), + ModelDeletion('export', 'DeIdMapping', 'domain'), ModelDeletion('export', 'EmailExportWhenDoneRequest', 'domain'), ModelDeletion('export', 'LedgerSectionEntry', 'domain'), CustomDeletion('export', _delete_data_files, []), diff --git a/corehq/apps/dump_reload/sql/dump.py b/corehq/apps/dump_reload/sql/dump.py index 7f949ee836b4..d1b16e298fe9 100644 --- a/corehq/apps/dump_reload/sql/dump.py +++ b/corehq/apps/dump_reload/sql/dump.py @@ -258,6 +258,7 @@ FilteredModelIteratorBuilder('email.EmailSettings', SimpleFilter('domain')), FilteredModelIteratorBuilder('dhis2.SQLDataSetMap', SimpleFilter('domain')), FilteredModelIteratorBuilder('dhis2.SQLDataValueMap', SimpleFilter('dataset_map__domain')), + FilteredModelIteratorBuilder('export.DeIdMapping', SimpleFilter('domain')), ]] From 29a676a0962cde43bb0649fe24e9de2d93e07e89 Mon Sep 17 00:00:00 2001 From: nospame Date: Fri, 14 Feb 2025 08:31:01 -0800 Subject: [PATCH 7/7] Always get domain from doc or case for DeIdMapping --- corehq/apps/export/models/deid_export.py | 8 ++++---- corehq/apps/export/tests/test_deid_export.py | 17 +++++------------ corehq/apps/hqcase/utils.py | 2 +- 3 files changed, 10 insertions(+), 17 deletions(-) diff --git a/corehq/apps/export/models/deid_export.py b/corehq/apps/export/models/deid_export.py index 8380163fbec6..73b77951cfca 100644 --- a/corehq/apps/export/models/deid_export.py +++ b/corehq/apps/export/models/deid_export.py @@ -18,11 +18,11 @@ class Meta: ] @classmethod - def get_deid(cls, value, doc, domain=None): - if doc is not None: - # use domain from the couch doc if one was passed in + def get_deid(cls, value, doc): + if isinstance(doc, dict): domain = doc['domain'] - + else: + domain = doc.domain return cls._get_deid(value, domain) @classmethod diff --git a/corehq/apps/export/tests/test_deid_export.py b/corehq/apps/export/tests/test_deid_export.py index c89f5a9f67ee..1198db6f195e 100644 --- a/corehq/apps/export/tests/test_deid_export.py +++ b/corehq/apps/export/tests/test_deid_export.py @@ -7,27 +7,20 @@ class TestDeIdMapping(TestCase): def test_deid_unique_by_domain(self): value = 'somedatavalue' - deid_one = DeIdMapping.get_deid(value, None, domain='test-domain-1') - deid_two = DeIdMapping.get_deid(value, None, domain='test-domain-2') + deid_one = DeIdMapping.get_deid(value, {'domain': 'test-domain-1'}) + deid_two = DeIdMapping.get_deid(value, {'domain': 'test-domain-2'}) self.assertNotEqual(deid_one, deid_two) def test_deid_consistent_for_value_and_domain(self): value = 'somedatavalue' domain = 'test-domain' - deid_one = DeIdMapping.get_deid(value, None, domain=domain) - deid_two = DeIdMapping.get_deid(value, None, domain=domain) + deid_one = DeIdMapping.get_deid(value, {'domain': domain}) + deid_two = DeIdMapping.get_deid(value, {'domain': domain}) self.assertEqual(deid_one, deid_two) def test_none_is_a_deidentifiable_value(self): value = None - deid = DeIdMapping.get_deid(value, None, domain='test-domain') + deid = DeIdMapping.get_deid(value, {'domain': 'test-domain'}) self.assertIsNotNone(deid) - - def test_uses_domain_from_doc(self): - doc = {'domain': 'doc-test-domain'} - - deid = DeIdMapping.get_deid('somedatavalue', doc) - deid_mapping = DeIdMapping.objects.get(deid=deid) - self.assertEqual(deid_mapping.domain, doc['domain']) diff --git a/corehq/apps/hqcase/utils.py b/corehq/apps/hqcase/utils.py index b2aee8bce920..62d4c19cff94 100644 --- a/corehq/apps/hqcase/utils.py +++ b/corehq/apps/hqcase/utils.py @@ -301,7 +301,7 @@ def get_deidentified_data(case, censor_data): censored_value = deid_date(case_value, None, key=case.case_id) if transform == DEID_ID_TRANSFORM: deid_id = get_deid_transform_function(DEID_TRANSFORM_FUNCTIONS[DEID_ID_TRANSFORM]) - censored_value = deid_id(case_value, None, domain=case.domain) + censored_value = deid_id(case_value, case) if is_case_property: props[attr_or_prop] = censored_value