Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Change de-identification method for Sensitive ID export fields #35772

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions corehq/apps/domain/deletion.py
Original file line number Diff line number Diff line change
Expand Up @@ -420,6 +420,7 @@ def _delete_demo_user_restores(domain_name):
PartitionedModelDeletion('scheduling_partitioned', 'CaseTimedScheduleInstance', 'domain'),
PartitionedModelDeletion('scheduling_partitioned', 'TimedScheduleInstance', 'domain'),
ModelDeletion('domain', 'TransferDomainRequest', 'domain'),
ModelDeletion('export', 'DeIdMapping', 'domain'),
ModelDeletion('export', 'EmailExportWhenDoneRequest', 'domain'),
ModelDeletion('export', 'LedgerSectionEntry', 'domain'),
CustomDeletion('export', _delete_data_files, []),
Expand Down
1 change: 1 addition & 0 deletions corehq/apps/dump_reload/sql/dump.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,7 @@
FilteredModelIteratorBuilder('email.EmailSettings', SimpleFilter('domain')),
FilteredModelIteratorBuilder('dhis2.SQLDataSetMap', SimpleFilter('domain')),
FilteredModelIteratorBuilder('dhis2.SQLDataValueMap', SimpleFilter('dataset_map__domain')),
FilteredModelIteratorBuilder('export.DeIdMapping', SimpleFilter('domain')),
]]


Expand Down
36 changes: 11 additions & 25 deletions corehq/apps/export/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,6 @@
Some of these constants correspond to constants set in corehq/apps/export/static/export/js/const.js
so if changing a value, ensure that both places reflect the change
"""
from couchexport.deid import deid_date, deid_ID

from corehq.apps.export.transforms import (
case_close_to_boolean,
case_id_to_case_name,
case_id_to_link,
case_or_user_id_to_name,
doc_type_transform,
form_id_to_link,
owner_id_to_display,
user_id_to_username,
workflow_transform,
)

# When fixing a bug that requires existing schemas to be rebuilt,
# bump the version number.
FORM_DATA_SCHEMA_VERSION = 10
Expand All @@ -25,8 +11,8 @@
DEID_ID_TRANSFORM = "deid_id"
DEID_DATE_TRANSFORM = "deid_date"
DEID_TRANSFORM_FUNCTIONS = {
DEID_ID_TRANSFORM: deid_ID,
DEID_DATE_TRANSFORM: deid_date,
DEID_DATE_TRANSFORM: 'deid_date',
DEID_ID_TRANSFORM: 'deid_ID',
}
CASE_NAME_TRANSFORM = "case_name_transform"
CASE_ID_TO_LINK = "case_link_transform"
Expand All @@ -38,15 +24,15 @@
CASE_OR_USER_ID_TRANSFORM = "case_or_user_id_transform"
CASE_CLOSE_TO_BOOLEAN = "case_close_to_boolean"
TRANSFORM_FUNCTIONS = {
CASE_NAME_TRANSFORM: case_id_to_case_name,
CASE_ID_TO_LINK: case_id_to_link,
FORM_ID_TO_LINK: form_id_to_link,
USERNAME_TRANSFORM: user_id_to_username,
OWNER_ID_TRANSFORM: owner_id_to_display,
WORKFLOW_TRANSFORM: workflow_transform,
DOC_TYPE_TRANSFORM: doc_type_transform,
CASE_OR_USER_ID_TRANSFORM: case_or_user_id_to_name,
CASE_CLOSE_TO_BOOLEAN: case_close_to_boolean,
CASE_NAME_TRANSFORM: 'case_id_to_case_name',
CASE_ID_TO_LINK: 'case_id_to_link',
FORM_ID_TO_LINK: 'form_id_to_link',
USERNAME_TRANSFORM: 'user_id_to_username',
OWNER_ID_TRANSFORM: 'owner_id_to_display',
WORKFLOW_TRANSFORM: 'workflow_transform',
DOC_TYPE_TRANSFORM: 'doc_type_transform',
CASE_OR_USER_ID_TRANSFORM: 'case_or_user_id_to_name',
CASE_CLOSE_TO_BOOLEAN: 'case_close_to_boolean',
}
PLAIN_USER_DEFINED_SPLIT_TYPE = 'plain'
MULTISELCT_USER_DEFINED_SPLIT_TYPE = 'multi-select'
Expand Down
39 changes: 39 additions & 0 deletions corehq/apps/export/migrations/0014_deidmapping.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Generated by Django 4.2.17 on 2025-02-12 00:46

from django.db import migrations, models
import uuid


class Migration(migrations.Migration):

dependencies = [
("export", "0013_rm_incrementalexport"),
]

operations = [
migrations.CreateModel(
name="DeIdMapping",
fields=[
(
"id",
models.AutoField(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How many of these do we expect to have? If the number grows very large, especially if there is a lot of churn, would it be worth using BigAutoField?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't have a sense of how widely the de-identified data feature is used, but it seems plausible we could run up against the limit using if a regular AutoField. If we consider exports from our larger projects that might have several hundred thousand unique cases, if each of those has a few different de-identified fields, and a few different projects of that scale use this feature, it would add up pretty quickly, so BigAutoField is likely worthwhile.

Copy link
Contributor

@millerdev millerdev Feb 18, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This suggestion of using BigAutoField pointed in a direction I'm not really in favor of. Sorry for the confusion. I'm strongly in favor of using a different hashing technique that doesn't require a new database model, such as using a unique salt per domain.

auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
("domain", models.TextField(max_length=255)),
("hashed_value", models.TextField(max_length=32)),
("deid", models.UUIDField(default=uuid.uuid4)),
],
options={
"indexes": [
models.Index(
fields=["domain", "hashed_value"],
name="export_deid_domain_3c63a4_idx",
)
],
},
),
]
4 changes: 4 additions & 0 deletions corehq/apps/export/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,10 @@
DataSourceExportInstance,
)

from .deid_export import (
DeIdMapping,
)

from .export_settings import (
DefaultExportSettings,
)
42 changes: 42 additions & 0 deletions corehq/apps/export/models/deid_export.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import hashlib
import uuid

from django.db import models

from corehq.const import ONE_DAY
from corehq.util.quickcache import quickcache


class DeIdMapping(models.Model):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do I have it right that the purpose of this model is to map values to UUIDs so that a given value always has the same UUID?

Could the model be eliminated by using the hash of the value + domain, possibly with a stronger hash function (SHA-256, for example)? That would make the get_deid function look something like:

from haslib import sha256

def get_deid(value, doc):
    domain = doc["domain"] if isinstance(doc, dict) else doc.domain
    data = f"{value} {domain}".encode("utf-8")
    return sha256(data).hexdigest()[:32]

We could also use something like a secret salt (possibly stored in settings.py, or even one per domain) to make it harder to re-identify the values, although that may be overkill.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yet another idea is to use the document ID as hash salt.

domain = models.TextField(max_length=255)
hashed_value = models.TextField(max_length=32)
deid = models.UUIDField(default=uuid.uuid4)
Comment on lines +12 to +13
Copy link
Contributor Author

@nospame nospame Feb 13, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had trouble coming up with good names for these fields, with fairly little context on export code in general. Open to alternatives here. Maybe something closer to hashed_item and deid_value?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we do stick with the db model, could we just make the primary key a uuid and use that instead of the deid field? (Code might not be 100% accurate)
id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)


class Meta:
indexes = [
models.Index(fields=['domain', 'hashed_value']),
]

@classmethod
def get_deid(cls, value, doc):
if isinstance(doc, dict):
domain = doc['domain']
else:
domain = doc.domain
Comment on lines +24 to +25
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably good to have a test cover this code as well.

return cls._get_deid(value, domain)

@classmethod
@quickcache(['value', 'domain'], timeout=90 * ONE_DAY)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

90 days is a long time. If the code behind this cache changes, it will not take effect until months later (for cached values). Can you explain more about why such a long cache timeout was chosen?

Copy link
Contributor Author

@nospame nospame Feb 18, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I hadn't considered code changes effectively being delayed by cache, which is a good point, but the length of time is because these values are essentially expected to stay the same forever once stored. The reason for caching here generally is so we're not making excessive DB calls for every repeated "sensitive ID" value in an export.

Maybe there's a shorter timeout here that would be more convenient for consideration of working with this code in the future, but still mitigate most of the performance impact on exports. Maybe 1 month? 1 week?

def _get_deid(cls, value, domain):
hashed_value = cls._hash_value(value)
deid_mapping, __ = cls.objects.get_or_create(domain=domain, hashed_value=hashed_value)
return str(deid_mapping.deid)

@staticmethod
@quickcache(['value'], timeout=90 * ONE_DAY)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why does the result of this function need to be cached?

My thoughts: The cache adds a network hit (to redis), but the function itself does not hit the network, so the cache would most likely make the function slower. Also, MD5 hashing should be very fast unless value is huge, but even in that case saving a huge value to the cache could be problematic.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I cached it here without thinking about the additional network hit, just that the return value will stay the same for a given input so we "might as well" cache it. I assume you're probably correct that simply hashing the value and returning it would be more performant than caching it here, so I'll make that change.

def _hash_value(value):
if value is None:
# None is a de-identifiable value but needs a string to encode for lookup
value = ''
Comment on lines +38 to +40
Copy link
Contributor Author

@nospame nospame Feb 13, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think there's a meaningful difference between an empty string and None in data this would be de-identifying.

Previously you would get the same output for the both None and the string 'None' and this seems better to me, but if there's any reason to retain the previous behavior I can just call str(value) instead.


return hashlib.md5(value.encode('utf-8')).hexdigest()
12 changes: 9 additions & 3 deletions corehq/apps/export/models/new.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,11 @@
get_form_export_base_query,
get_sms_export_base_query,
)
from corehq.apps.export.utils import is_occurrence_deleted
from corehq.apps.export.utils import (
get_deid_transform_function,
get_transform_function,
is_occurrence_deleted,
)
from corehq.apps.locations.models import SQLLocation
from corehq.apps.products.models import SQLProduct
from corehq.apps.reports.daterange import get_daterange_start_end_dates
Expand Down Expand Up @@ -322,10 +326,12 @@ def _transform(self, value, doc, transform_dates):
if transform_dates:
value = couch_to_excel_datetime(value, doc)
if self.item.transform:
value = TRANSFORM_FUNCTIONS[self.item.transform](value, doc)
transform_function = get_transform_function(TRANSFORM_FUNCTIONS[self.item.transform])
value = transform_function(value, doc)
if self.deid_transform:
try:
value = DEID_TRANSFORM_FUNCTIONS[self.deid_transform](value, doc)
transform_function = get_deid_transform_function(DEID_TRANSFORM_FUNCTIONS[self.deid_transform])
value = transform_function(value, doc)
except ValueError:
# Unable to convert the string to a date
pass
Expand Down
26 changes: 26 additions & 0 deletions corehq/apps/export/tests/test_deid_export.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from django.test import TestCase

from corehq.apps.export.models import DeIdMapping


class TestDeIdMapping(TestCase):
def test_deid_unique_by_domain(self):
value = 'somedatavalue'

deid_one = DeIdMapping.get_deid(value, {'domain': 'test-domain-1'})
deid_two = DeIdMapping.get_deid(value, {'domain': 'test-domain-2'})
self.assertNotEqual(deid_one, deid_two)

def test_deid_consistent_for_value_and_domain(self):
value = 'somedatavalue'
domain = 'test-domain'

deid_one = DeIdMapping.get_deid(value, {'domain': domain})
deid_two = DeIdMapping.get_deid(value, {'domain': domain})
self.assertEqual(deid_one, deid_two)

def test_none_is_a_deidentifiable_value(self):
value = None

deid = DeIdMapping.get_deid(value, {'domain': 'test-domain'})
self.assertIsNotNone(deid)
20 changes: 19 additions & 1 deletion corehq/apps/export/utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
import importlib
from django.http import Http404

from couchdbkit import ResourceNotFound

from corehq.apps.accounting.models import Subscription
from corehq.apps.accounting.utils import domain_has_privilege
from corehq.apps.export.const import (
DEID_DATE_TRANSFORM, DEID_ID_TRANSFORM, DEID_TRANSFORM_FUNCTIONS
)
from corehq.privileges import DAILY_SAVED_EXPORT, DEFAULT_EXPORT_SETTINGS, EXCEL_DASHBOARD
from corehq.toggles import MESSAGE_LOG_METADATA

Expand Down Expand Up @@ -57,10 +60,25 @@ def get_default_export_settings_if_available(domain):
"""
Only creates settings if the domain has the DEFAULT_EXPORT_SETTINGS privilege
"""
from corehq.apps.accounting.models import Subscription
settings = None
current_subscription = Subscription.get_active_subscription_by_domain(domain)
if current_subscription and domain_has_privilege(domain, DEFAULT_EXPORT_SETTINGS):
from corehq.apps.export.models import DefaultExportSettings
settings = DefaultExportSettings.objects.get_or_create(account=current_subscription.account)[0]

return settings


def get_transform_function(func_name):
module = importlib.import_module('corehq.apps.export.transforms')
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is importlib necessary here? Does a normal import not work?

Suggested change
module = importlib.import_module('corehq.apps.export.transforms')
import corehq.apps.export.transforms as module

return getattr(module, func_name)
nospame marked this conversation as resolved.
Show resolved Hide resolved


def get_deid_transform_function(func_name):
if func_name == DEID_TRANSFORM_FUNCTIONS[DEID_DATE_TRANSFORM]:
module = importlib.import_module('couchexport.deid')
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
module = importlib.import_module('couchexport.deid')
import couchexport.deid as module

return getattr(module, func_name)
elif func_name == DEID_TRANSFORM_FUNCTIONS[DEID_ID_TRANSFORM]:
from corehq.apps.export.models import DeIdMapping
return DeIdMapping.get_deid
Copy link
Contributor

@millerdev millerdev Feb 18, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This function will return None when neither of the if-conditions are met. Is that intentional? Is that scenario covered by a test?

12 changes: 9 additions & 3 deletions corehq/apps/hqcase/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,18 @@

from casexml.apps.case.mock import CaseBlock
from casexml.apps.case.util import property_changed_in_action
from couchexport.deid import deid_date, deid_ID
from dimagi.utils.parsing import json_format_datetime

from corehq.apps.case_search.const import INDEXED_METADATA_BY_KEY
from corehq.apps.data_interfaces.deduplication import DEDUPE_XMLNS
from corehq.apps.es import filters
from corehq.apps.es.cases import CaseES
from corehq.apps.export.const import DEID_DATE_TRANSFORM, DEID_ID_TRANSFORM
from corehq.apps.export.const import (
DEID_DATE_TRANSFORM,
DEID_ID_TRANSFORM,
DEID_TRANSFORM_FUNCTIONS,
)
from corehq.apps.export.utils import get_deid_transform_function
from corehq.apps.receiverwrapper.util import submit_form_locally
from corehq.apps.users.util import SYSTEM_USER_ID
from corehq.form_processor.exceptions import CaseNotFound, MissingFormXml
Expand Down Expand Up @@ -293,9 +297,11 @@ def get_deidentified_data(case, censor_data):

censored_value = ''
if transform == DEID_DATE_TRANSFORM:
deid_date = get_deid_transform_function(DEID_TRANSFORM_FUNCTIONS[DEID_DATE_TRANSFORM])
censored_value = deid_date(case_value, None, key=case.case_id)
if transform == DEID_ID_TRANSFORM:
censored_value = deid_ID(case_value, None)
deid_id = get_deid_transform_function(DEID_TRANSFORM_FUNCTIONS[DEID_ID_TRANSFORM])
censored_value = deid_id(case_value, case)
Comment on lines 299 to +304
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion

Add error handling for de-identification function calls.

The de-identification function calls lack error handling, which could lead to runtime errors if the functions fail.

Add error handling:

 if transform == DEID_DATE_TRANSFORM:
-    deid_date = get_deid_transform_function(DEID_TRANSFORM_FUNCTIONS[DEID_DATE_TRANSFORM])
-    censored_value = deid_date(case_value, None, key=case.case_id)
+    try:
+        deid_date = get_deid_transform_function(DEID_TRANSFORM_FUNCTIONS[DEID_DATE_TRANSFORM])
+        censored_value = deid_date(case_value, None, key=case.case_id)
+    except Exception as e:
+        logger.error(f"Error applying date de-identification: {e}")
+        censored_value = ''
 if transform == DEID_ID_TRANSFORM:
-    deid_id = get_deid_transform_function(DEID_TRANSFORM_FUNCTIONS[DEID_ID_TRANSFORM])
-    censored_value = deid_id(case_value, case)
+    try:
+        deid_id = get_deid_transform_function(DEID_TRANSFORM_FUNCTIONS[DEID_ID_TRANSFORM])
+        censored_value = deid_id(case_value, case)
+    except Exception as e:
+        logger.error(f"Error applying ID de-identification: {e}")
+        censored_value = ''
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
if transform == DEID_DATE_TRANSFORM:
deid_date = get_deid_transform_function(DEID_TRANSFORM_FUNCTIONS[DEID_DATE_TRANSFORM])
censored_value = deid_date(case_value, None, key=case.case_id)
if transform == DEID_ID_TRANSFORM:
censored_value = deid_ID(case_value, None)
deid_id = get_deid_transform_function(DEID_TRANSFORM_FUNCTIONS[DEID_ID_TRANSFORM])
censored_value = deid_id(case_value, case)
if transform == DEID_DATE_TRANSFORM:
try:
deid_date = get_deid_transform_function(DEID_TRANSFORM_FUNCTIONS[DEID_DATE_TRANSFORM])
censored_value = deid_date(case_value, None, key=case.case_id)
except Exception as e:
logger.error(f"Error applying date de-identification: {e}")
censored_value = ''
if transform == DEID_ID_TRANSFORM:
try:
deid_id = get_deid_transform_function(DEID_TRANSFORM_FUNCTIONS[DEID_ID_TRANSFORM])
censored_value = deid_id(case_value, case)
except Exception as e:
logger.error(f"Error applying ID de-identification: {e}")
censored_value = ''


if is_case_property:
props[attr_or_prop] = censored_value
Expand Down
1 change: 1 addition & 0 deletions migrations.lock
Original file line number Diff line number Diff line change
Expand Up @@ -465,6 +465,7 @@ export
0011_defaultexportsettings_usecouchfiletypes
0012_defaultexportsettings_remove_duplicates_option
0013_rm_incrementalexport
0014_deidmapping
fhir
0001_initial
0002_fhirresourcetype
Expand Down
Loading