diff --git a/corehq/apps/domain/deletion.py b/corehq/apps/domain/deletion.py
index 9294e24fda66..f26c3280a74d 100644
--- a/corehq/apps/domain/deletion.py
+++ b/corehq/apps/domain/deletion.py
@@ -420,6 +420,7 @@ def _delete_demo_user_restores(domain_name):
     PartitionedModelDeletion('scheduling_partitioned', 'CaseTimedScheduleInstance', 'domain'),
     PartitionedModelDeletion('scheduling_partitioned', 'TimedScheduleInstance', 'domain'),
     ModelDeletion('domain', 'TransferDomainRequest', 'domain'),
+    ModelDeletion('export', 'DeIdMapping', 'domain'),
     ModelDeletion('export', 'EmailExportWhenDoneRequest', 'domain'),
     ModelDeletion('export', 'LedgerSectionEntry', 'domain'),
     CustomDeletion('export', _delete_data_files, []),
diff --git a/corehq/apps/dump_reload/sql/dump.py b/corehq/apps/dump_reload/sql/dump.py
index 7f949ee836b4..d1b16e298fe9 100644
--- a/corehq/apps/dump_reload/sql/dump.py
+++ b/corehq/apps/dump_reload/sql/dump.py
@@ -258,6 +258,7 @@
     FilteredModelIteratorBuilder('email.EmailSettings', SimpleFilter('domain')),
     FilteredModelIteratorBuilder('dhis2.SQLDataSetMap', SimpleFilter('domain')),
     FilteredModelIteratorBuilder('dhis2.SQLDataValueMap', SimpleFilter('dataset_map__domain')),
+    FilteredModelIteratorBuilder('export.DeIdMapping', SimpleFilter('domain')),
 ]]
 
 
diff --git a/corehq/apps/export/const.py b/corehq/apps/export/const.py
index 6becd655542f..0feb81f86366 100644
--- a/corehq/apps/export/const.py
+++ b/corehq/apps/export/const.py
@@ -2,20 +2,6 @@
 Some of these constants correspond to constants set in
 corehq/apps/export/static/export/js/const.js so if changing a value, ensure that both places reflect the change
 """
-from couchexport.deid import deid_date, deid_ID
-
-from corehq.apps.export.transforms import (
-    case_close_to_boolean,
-    case_id_to_case_name,
-    case_id_to_link,
-    case_or_user_id_to_name,
-    doc_type_transform,
-    form_id_to_link,
-    owner_id_to_display,
-    user_id_to_username,
-    workflow_transform,
-)
-
 # When fixing a bug that requires existing schemas to be rebuilt,
 # bump the version number.
 FORM_DATA_SCHEMA_VERSION = 10
@@ -25,8 +11,8 @@
 DEID_ID_TRANSFORM = "deid_id"
 DEID_DATE_TRANSFORM = "deid_date"
 DEID_TRANSFORM_FUNCTIONS = {
-    DEID_ID_TRANSFORM: deid_ID,
-    DEID_DATE_TRANSFORM: deid_date,
+    DEID_DATE_TRANSFORM: 'deid_date',
+    DEID_ID_TRANSFORM: 'deid_ID',
 }
 CASE_NAME_TRANSFORM = "case_name_transform"
 CASE_ID_TO_LINK = "case_link_transform"
@@ -38,15 +24,15 @@
 CASE_OR_USER_ID_TRANSFORM = "case_or_user_id_transform"
 CASE_CLOSE_TO_BOOLEAN = "case_close_to_boolean"
 TRANSFORM_FUNCTIONS = {
-    CASE_NAME_TRANSFORM: case_id_to_case_name,
-    CASE_ID_TO_LINK: case_id_to_link,
-    FORM_ID_TO_LINK: form_id_to_link,
-    USERNAME_TRANSFORM: user_id_to_username,
-    OWNER_ID_TRANSFORM: owner_id_to_display,
-    WORKFLOW_TRANSFORM: workflow_transform,
-    DOC_TYPE_TRANSFORM: doc_type_transform,
-    CASE_OR_USER_ID_TRANSFORM: case_or_user_id_to_name,
-    CASE_CLOSE_TO_BOOLEAN: case_close_to_boolean,
+    CASE_NAME_TRANSFORM: 'case_id_to_case_name',
+    CASE_ID_TO_LINK: 'case_id_to_link',
+    FORM_ID_TO_LINK: 'form_id_to_link',
+    USERNAME_TRANSFORM: 'user_id_to_username',
+    OWNER_ID_TRANSFORM: 'owner_id_to_display',
+    WORKFLOW_TRANSFORM: 'workflow_transform',
+    DOC_TYPE_TRANSFORM: 'doc_type_transform',
+    CASE_OR_USER_ID_TRANSFORM: 'case_or_user_id_to_name',
+    CASE_CLOSE_TO_BOOLEAN: 'case_close_to_boolean',
 }
 PLAIN_USER_DEFINED_SPLIT_TYPE = 'plain'
 MULTISELCT_USER_DEFINED_SPLIT_TYPE = 'multi-select'
diff --git a/corehq/apps/export/migrations/0014_deidmapping.py b/corehq/apps/export/migrations/0014_deidmapping.py
new file mode 100644
index 000000000000..635b2d4a059f
--- /dev/null
+++ b/corehq/apps/export/migrations/0014_deidmapping.py
@@ -0,0 +1,41 @@
+# Generated by Django 4.2.17 on 2025-02-12 00:46
+
+from django.db import migrations, models
+import uuid
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("export", "0013_rm_incrementalexport"),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name="DeIdMapping",
+            fields=[
+                (
+                    "id",
+                    models.AutoField(
+                        auto_created=True,
+                        primary_key=True,
+                        serialize=False,
+                        verbose_name="ID",
+                    ),
+                ),
+                ("domain", models.TextField(max_length=255)),
+                ("hashed_value", models.TextField(max_length=32)),
+                ("deid", models.UUIDField(default=uuid.uuid4)),
+            ],
+            options={
+                # Uniqueness is what makes get_or_create() on these
+                # two fields safe when it is called concurrently.
+                "constraints": [
+                    models.UniqueConstraint(
+                        fields=("domain", "hashed_value"),
+                        name="export_deid_domain_value_uniq",
+                    )
+                ],
+            },
+        ),
+    ]
diff --git a/corehq/apps/export/models/__init__.py b/corehq/apps/export/models/__init__.py
index bef7b31fa23b..7b3a351702e9 100644
--- a/corehq/apps/export/models/__init__.py
+++ b/corehq/apps/export/models/__init__.py
@@ -45,6 +45,10 @@
     DataSourceExportInstance,
 )
 
+from .deid_export import (
+    DeIdMapping,
+)
+
 from .export_settings import (
     DefaultExportSettings,
 )
diff --git a/corehq/apps/export/models/deid_export.py b/corehq/apps/export/models/deid_export.py
new file mode 100644
index 000000000000..73b77951cfca
--- /dev/null
+++ b/corehq/apps/export/models/deid_export.py
@@ -0,0 +1,58 @@
+import hashlib
+import uuid
+
+from django.db import models
+
+from corehq.const import ONE_DAY
+from corehq.util.quickcache import quickcache
+
+
+class DeIdMapping(models.Model):
+    """Maps the hash of a value to a stable, randomly generated ID per domain."""
+    domain = models.TextField(max_length=255)
+    hashed_value = models.TextField(max_length=32)
+    deid = models.UUIDField(default=uuid.uuid4)
+
+    class Meta:
+        # get_or_create() in _get_deid() is only safe under concurrency when
+        # the database enforces uniqueness of the fields it looks up by.
+        constraints = [
+            models.UniqueConstraint(
+                fields=['domain', 'hashed_value'],
+                name='export_deid_domain_value_uniq',
+            ),
+        ]
+
+    @classmethod
+    def get_deid(cls, value, doc):
+        """Return the de-identified ID for ``value`` within the domain of ``doc``.
+
+        ``doc`` is either a dict with a ``'domain'`` key or an object with a
+        ``domain`` attribute.
+        """
+        if isinstance(doc, dict):
+            domain = doc['domain']
+        else:
+            domain = doc.domain
+        return cls._get_deid(value, domain)
+
+    @classmethod
+    @quickcache(['value', 'domain'], timeout=90 * ONE_DAY)
+    def _get_deid(cls, value, domain):
+        """Fetch the mapping for ``value``, creating it if needed; return its ID as a string."""
+        hashed_value = cls._hash_value(value)
+        deid_mapping, __ = cls.objects.get_or_create(domain=domain, hashed_value=hashed_value)
+        return str(deid_mapping.deid)
+
+    @staticmethod
+    @quickcache(['value'], timeout=90 * ONE_DAY)
+    def _hash_value(value):
+        """Return the MD5 hex digest used as the lookup key for ``value``."""
+        if value is None:
+            # None is a de-identifiable value but needs a string to encode for lookup
+            value = ''
+        elif not isinstance(value, str):
+            # Non-string values have no encode(); hash their string form instead
+            value = str(value)
+
+        return hashlib.md5(value.encode('utf-8')).hexdigest()
diff --git a/corehq/apps/export/models/new.py b/corehq/apps/export/models/new.py
index dd94f03cd70c..ecff324f7c06 100644
--- a/corehq/apps/export/models/new.py
+++ b/corehq/apps/export/models/new.py
@@ -99,7 +99,11 @@
     get_form_export_base_query,
     get_sms_export_base_query,
 )
-from corehq.apps.export.utils import is_occurrence_deleted
+from corehq.apps.export.utils import (
+    get_deid_transform_function,
+    get_transform_function,
+    is_occurrence_deleted,
+)
 from corehq.apps.locations.models import SQLLocation
 from corehq.apps.products.models import SQLProduct
 from corehq.apps.reports.daterange import get_daterange_start_end_dates
@@ -322,10 +326,12 @@ def _transform(self, value, doc, transform_dates):
         if transform_dates:
             value = couch_to_excel_datetime(value, doc)
         if self.item.transform:
-            value = TRANSFORM_FUNCTIONS[self.item.transform](value, doc)
+            transform_function = get_transform_function(TRANSFORM_FUNCTIONS[self.item.transform])
+            value = transform_function(value, doc)
         if self.deid_transform:
             try:
-                value = DEID_TRANSFORM_FUNCTIONS[self.deid_transform](value, doc)
+                transform_function = get_deid_transform_function(DEID_TRANSFORM_FUNCTIONS[self.deid_transform])
+                value = transform_function(value, doc)
             except ValueError:
                 # Unable to convert the string to a date
                 pass
diff --git a/corehq/apps/export/tests/test_deid_export.py b/corehq/apps/export/tests/test_deid_export.py
new file mode 100644
index 000000000000..1198db6f195e
--- /dev/null
+++ b/corehq/apps/export/tests/test_deid_export.py
@@ -0,0 +1,30 @@
+from django.test import TestCase
+
+from corehq.apps.export.models import DeIdMapping
+
+
+class TestDeIdMapping(TestCase):
+    def test_deid_unique_by_domain(self):
+        value = 'somedatavalue'
+
+        deid_one = DeIdMapping.get_deid(value, {'domain': 'test-domain-1'})
+        deid_two = DeIdMapping.get_deid(value, {'domain': 'test-domain-2'})
+        self.assertNotEqual(deid_one, deid_two)
+
+    def test_deid_consistent_for_value_and_domain(self):
+        value = 'somedatavalue'
+        domain = 'test-domain'
+
+        deid_one = DeIdMapping.get_deid(value, {'domain': domain})
+        deid_two = DeIdMapping.get_deid(value, {'domain': domain})
+        self.assertEqual(deid_one, deid_two)
+
+    def test_none_is_a_deidentifiable_value(self):
+        value = None
+
+        deid = DeIdMapping.get_deid(value, {'domain': 'test-domain'})
+        self.assertIsNotNone(deid)
+
+    def test_non_string_value_is_deidentifiable(self):
+        deid = DeIdMapping.get_deid(12345, {'domain': 'test-domain'})
+        self.assertIsNotNone(deid)
diff --git a/corehq/apps/export/utils.py b/corehq/apps/export/utils.py
index 5e638f4944d2..e77a9c54c5c8 100644
--- a/corehq/apps/export/utils.py
+++ b/corehq/apps/export/utils.py
@@ -1,9 +1,12 @@
+import importlib
 from django.http import Http404
 
 from couchdbkit import ResourceNotFound
 
-from corehq.apps.accounting.models import Subscription
 from corehq.apps.accounting.utils import domain_has_privilege
+from corehq.apps.export.const import (
+    DEID_DATE_TRANSFORM, DEID_ID_TRANSFORM, DEID_TRANSFORM_FUNCTIONS
+)
 from corehq.privileges import DAILY_SAVED_EXPORT, DEFAULT_EXPORT_SETTINGS, EXCEL_DASHBOARD
 from corehq.toggles import MESSAGE_LOG_METADATA
 
@@ -57,6 +60,7 @@ def get_default_export_settings_if_available(domain):
     """
     Only creates settings if the domain has the DEFAULT_EXPORT_SETTINGS privilege
     """
+    from corehq.apps.accounting.models import Subscription
     settings = None
     current_subscription = Subscription.get_active_subscription_by_domain(domain)
     if current_subscription and domain_has_privilege(domain, DEFAULT_EXPORT_SETTINGS):
@@ -64,3 +68,25 @@ def get_default_export_settings_if_available(domain):
         settings = DefaultExportSettings.objects.get_or_create(account=current_subscription.account)[0]
 
     return settings
+
+
+def get_transform_function(func_name):
+    """Return the function named ``func_name`` from ``corehq.apps.export.transforms``."""
+    module = importlib.import_module('corehq.apps.export.transforms')
+    return getattr(module, func_name)
+
+
+def get_deid_transform_function(func_name):
+    """Return the de-identification function registered under ``func_name``.
+
+    :raises LookupError: if ``func_name`` is not a value of ``DEID_TRANSFORM_FUNCTIONS``
+    """
+    if func_name == DEID_TRANSFORM_FUNCTIONS[DEID_DATE_TRANSFORM]:
+        module = importlib.import_module('couchexport.deid')
+        return getattr(module, func_name)
+    if func_name == DEID_TRANSFORM_FUNCTIONS[DEID_ID_TRANSFORM]:
+        # Imported here because corehq.apps.export.models.new imports this module
+        from corehq.apps.export.models import DeIdMapping
+        return DeIdMapping.get_deid
+    # Fail loudly instead of returning None for the caller to try to call
+    raise LookupError(f'Unknown de-identification transform function: {func_name!r}')
diff --git a/corehq/apps/hqcase/utils.py b/corehq/apps/hqcase/utils.py
index 9ad8ac3208de..62d4c19cff94 100644
--- a/corehq/apps/hqcase/utils.py
+++ b/corehq/apps/hqcase/utils.py
@@ -7,14 +7,18 @@
 
 from casexml.apps.case.mock import CaseBlock
 from casexml.apps.case.util import property_changed_in_action
-from couchexport.deid import deid_date, deid_ID
 from dimagi.utils.parsing import json_format_datetime
 
 from corehq.apps.case_search.const import INDEXED_METADATA_BY_KEY
 from corehq.apps.data_interfaces.deduplication import DEDUPE_XMLNS
 from corehq.apps.es import filters
 from corehq.apps.es.cases import CaseES
-from corehq.apps.export.const import DEID_DATE_TRANSFORM, DEID_ID_TRANSFORM
+from corehq.apps.export.const import (
+    DEID_DATE_TRANSFORM,
+    DEID_ID_TRANSFORM,
+    DEID_TRANSFORM_FUNCTIONS,
+)
+from corehq.apps.export.utils import get_deid_transform_function
 from corehq.apps.receiverwrapper.util import submit_form_locally
 from corehq.apps.users.util import SYSTEM_USER_ID
 from corehq.form_processor.exceptions import CaseNotFound, MissingFormXml
@@ -293,9 +297,11 @@ def get_deidentified_data(case, censor_data):
             censored_value = ''
 
         if transform == DEID_DATE_TRANSFORM:
+            deid_date = get_deid_transform_function(DEID_TRANSFORM_FUNCTIONS[DEID_DATE_TRANSFORM])
             censored_value = deid_date(case_value, None, key=case.case_id)
         if transform == DEID_ID_TRANSFORM:
-            censored_value = deid_ID(case_value, None)
+            deid_id = get_deid_transform_function(DEID_TRANSFORM_FUNCTIONS[DEID_ID_TRANSFORM])
+            censored_value = deid_id(case_value, case)
 
         if is_case_property:
             props[attr_or_prop] = censored_value
diff --git a/migrations.lock b/migrations.lock
index a3bf57ba6f42..7cec7c58dc2a 100644
--- a/migrations.lock
+++ b/migrations.lock
@@ -465,6 +465,7 @@ export
  0011_defaultexportsettings_usecouchfiletypes
  0012_defaultexportsettings_remove_duplicates_option
  0013_rm_incrementalexport
+ 0014_deidmapping
 fhir
  0001_initial
  0002_fhirresourcetype