forked from PovertyAction/PII_detection
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPII_data_processor.py
807 lines (578 loc) · 33.4 KB
/
PII_data_processor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
import restricted_words as restricted_words_list
import pandas as pd
# from nltk.stem.porter import PorterStemmer
import time
import numpy as np
from constant_strings import *
import urllib.request as urllib2
import api_queries
import find_piis_in_unstructured_text as unstructured_text
import fileinput
import shutil
import os
from datetime import date
import hash_generator
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import os
from os import listdir
from os.path import isfile, isdir, join
import ntpath
import shutil
OUTPUTS_FOLDER = None
LOG_FILE_PATH = None
def get_surveycto_restricted_vars():
return restricted_words_list.get_surveycto_restricted_vars()
def import_dataset(dataset_path):
dataset, label_dict, value_label_dict = False, False, False
raise_error = False
status_message = False
# if dataset_path.endswith(('"', "'")):
# dataset_path = dataset_path[1:-1]
# dataset_path_l = dataset_path.lower()
#Check format
if(dataset_path.endswith(('xlsx', 'xls','csv','dta')) is False):
return (False, 'Supported files are .csv, .dta, .xlsx, .xls')
try:
if dataset_path.endswith(('xlsx', 'xls')):
dataset = pd.read_excel(dataset_path)
elif dataset_path.endswith('csv'):
dataset = pd.read_csv(dataset_path)
elif dataset_path.endswith('dta'):
try:
dataset = pd.read_stata(dataset_path)
except ValueError:
dataset = pd.read_stata(dataset_path, convert_categoricals=False)
label_dict = pd.io.stata.StataReader(dataset_path).variable_labels()
try:
value_label_dict = pd.io.stata.StataReader(dataset_path).value_labels()
except AttributeError:
status_message = "No value labels detected. " # Not printed in the app, overwritten later.
elif dataset_path.endswith(('xpt', '.sas7bdat')):
dataset = pd.read_sas(dataset_path)
elif dataset_path.endswith('vc'):
status_message = "**ERROR**: This folder appears to be encrypted using VeraCrypt."
raise Exception
elif dataset_path.endswith('bc'):
status_message = "**ERROR**: This file appears to be encrypted using Boxcryptor. Sign in to Boxcryptor and then select the file in your X: drive."
raise Exception
else:
raise Exception
except (FileNotFoundError, Exception):
if status_message is False:
status_message = '**ERROR**: This path appears to be invalid. If your folders or filename contain colons or commas, try renaming them or moving the file to a different location.'
raise
if (status_message):
log_and_print("There was an error")
log_and_print(status_message)
return (False, status_message)
log_and_print('The dataset has been read successfully.\n')
dataset_read_return = [dataset, dataset_path, label_dict, value_label_dict]
return (True, dataset_read_return)
def word_match(column_name, restricted_word, type_of_matching=STRICT):
if(type_of_matching == STRICT):
return column_name.lower() == restricted_word.lower()
else: # type_of_matching == FUZZY
#Check if restricted word is inside column_name
return restricted_word.lower() in column_name.lower()
def remove_other_refuse_and_dont_know(column):
#List of values to remove. All numbers with 3 digits where all digits are the same
values_to_remove = [str(111*i) for i in range(-9,10) if i !=0]
filtered_column = column[~column.isin(values_to_remove)]
return filtered_column
def clean_column(column):
#Drop NaNs
column_filtered = column.dropna()
#Remove empty entries
column_filtered = column_filtered[column_filtered!='']
#Remove other, refuses and dont knows
if len(column_filtered)!=0:
column_filtered = remove_other_refuse_and_dont_know(column_filtered)
return column_filtered
def column_is_sparse(dataset, column_name, sparse_threshold):
column_filtered = clean_column(dataset[column_name])
#Check sparcity
n_entries = len(column_filtered)
n_unique_entries = column_filtered.nunique()
if n_entries != 0 and n_unique_entries/n_entries > sparse_threshold:
return True
else:
return False
def column_has_sufficiently_sparse_strings(dataset, column_name, sparse_threshold=0.2):
'''
Checks if 'valid' column entries are sparse, defined as ratio between unique_entries/total_entries.
Consider only valid stands, aka, exludet NaN, '', Other, Refuse to respond, Not Know
'''
#Check if column type is string
if dataset[column_name].dtypes == 'object':
return column_is_sparse(dataset, column_name, sparse_threshold)
else:
return False
def column_has_sparse_value_label_dicts(column_name, value_label_dict, sparse_threshold = 10):
'''
Check if for a given column, its values come encoded in a dictionary and are sufficiently sparse
'''
if column_name in value_label_dict and value_label_dict[column_name] != '' and len(value_label_dict[column_name])>sparse_threshold:
return True
else:
return False
def find_piis_based_on_column_name(dataset, label_dict, value_label_dict, columns_to_check, consider_locations_cols):
#Identifies columns whose names or labels match (strict or fuzzy) any word in the predefined list of restricted words. Also considers that data entries must be sufficiently sparse strings (Ideally, this method will capture columns with people names) or value label dictionaries (for locations)
pii_strict_restricted_words = restricted_words_list.get_strict_restricted_words()
pii_fuzzy_restricted_words = restricted_words_list.get_fuzzy_restricted_words()
#If consider_locations_cols = 1, then consider locations columns in the search
if(consider_locations_cols == 1):
#If we are not checking locations populations, then include locations columns as part of restricted words
locations_strict_restricted_words = restricted_words_list.get_locations_strict_restricted_words()
locations_fuzzy_restricted_words = restricted_words_list.get_locations_fuzzy_restricted_words()
pii_strict_restricted_words = set(pii_strict_restricted_words + locations_strict_restricted_words)
pii_fuzzy_restricted_words = set(pii_fuzzy_restricted_words + locations_fuzzy_restricted_words)
#We will save all restricted words in a dictionary, where the keys are the words and their values is if we are looking for a strict or fuzzy matching with that word
restricted_words = {}
for word in pii_strict_restricted_words:
restricted_words[word] = STRICT
for word in pii_fuzzy_restricted_words:
restricted_words[word] = FUZZY
# Looks for matches between column names (and labels) to restricted words
possible_pii = {}
#For every column name in our dataset
for column_name in columns_to_check:
#For every restricted word
for restricted_word, type_of_matching in restricted_words.items():
#Check if restricted word is in the column name
column_name_match = word_match(column_name, restricted_word, type_of_matching)
#If there is a dictionary of labels, check match with label
if label_dict is not False: #label_dict will be False in case of no labels
column_label = label_dict[column_name]
column_label_match = word_match(column_label, restricted_word, type_of_matching)
else:
column_label_match = False
#If there was a match between column name or label with restricted word
if column_name_match or column_label_match:
#If there was a strict match with restricted word
if type_of_matching == STRICT:
log_and_print("Column '"+column_name+"' considered possible pii given column name had a "+type_of_matching+" match with restricted word '"+ restricted_word+"'")
possible_pii[column_name] = "Name had "+ type_of_matching + " match with restricted word '"+restricted_word+"'"
#If column has strings and is sparse
elif column_has_sufficiently_sparse_strings(dataset, column_name):
#Log result and save column as possible pii. Theres different log depending if match was with column or label
if(column_name_match):
log_and_print("Column '"+column_name+"' considered possible pii given column name had a "+type_of_matching+" match with restricted word '"+ restricted_word+"' and has sufficiently sparse strings")
possible_pii[column_name] = "Name had "+ type_of_matching + " match with restricted word '"+restricted_word+"' and has sufficiently sparse strings"
elif(column_label_match):
log_and_print("Column '"+column_name+ "' considered possible pii given column label '"+column_label+"' had a "+type_of_matching+" match with restricted word '"+ restricted_word+"' and has sufficiently sparse strings")
possible_pii[column_name] = "Label had "+ type_of_matching + " match with restricted word '"+restricted_word+"' and has sufficiently sparse strings"
#If found, I dont need to keep checking this column with other restricted words
break
#Else, check if column has values labels (locations are usually stores this way)
elif column_has_sparse_value_label_dicts(column_name, value_label_dict):
if(column_name_match):
log_and_print("Column '"+column_name+"' considered possible pii given column name had a "+type_of_matching+" match with restricted word '"+ restricted_word+"' and values labels are sparse")
possible_pii[column_name] = "Name had "+ type_of_matching + " match with restricted word '"+restricted_word+"' and values labels are sparse"
elif(column_label_match):
log_and_print("Column '"+column_name+ "' considered possible pii given column label '"+column_label+"' had a "+type_of_matching+" match with restricted word '"+ restricted_word+"' and values labels are sparse")
possible_pii[column_name] = "Label had "+ type_of_matching + " match with restricted word '"+restricted_word+"' and values labels are sparse"
#If found, I dont need to keep checking this column with other restricted words
break
return possible_pii
def column_has_locations_with_low_populations(dataset, column_name, country):
column_filtered = clean_column(dataset[column_name])
#Get unique values
unique_locations = column_filtered.unique().tolist()
return api_queries.get_locations_with_low_population(unique_locations, country=country, return_one=True)
def log_and_print(message):
file = open(LOG_FILE_PATH, "a")
file.write(message+'\n')
file.close()
print(message)
def log_and_print(message):
file = open(LOG_FILE_PATH, "a")
file.write(message+'\n')
file.close()
print(message)
def find_piis_based_on_locations_population(dataset, label_dict, columns_to_check, country):
#Identifies columns whose names or labels match (strict or fuzzy) words related to locations. Then, check if for those columns, any value relates to a location with population under 20,000. If it is the case, then it flags the column.
#Lots of repeated code respect to find_piis_based_on_column_name, could refactor.
locations_strict_restricted_words = restricted_words_list.get_locations_strict_restricted_words()
locations_fuzzy_restricted_words = restricted_words_list.get_locations_fuzzy_restricted_words()
#We will save all restricted words in a dictionary, where the keys are the words and their values is if we are looking for a strict or fuzzy matching with that word
restricted_words = {}
for word in locations_strict_restricted_words:
restricted_words[word] = STRICT
for word in locations_fuzzy_restricted_words:
restricted_words[word] = FUZZY
# Looks for matches between column names (and labels) to restricted words
possible_pii = {}
#For every column name in our dataset
for column_name in columns_to_check:
#For every restricted word
for restricted_word, type_of_matching in restricted_words.items():
#Check if restricted word is in the column name
column_name_match = word_match(column_name, restricted_word, type_of_matching)
#If there is a dictionary of labels, check match with label
if label_dict is not False: #label_dict will be False in case of no labels
column_label = label_dict[column_name]
column_label_match = word_match(column_label, restricted_word, type_of_matching)
else:
column_label_match = False
#If there was a match between column name or label with restricted word
if column_name_match or column_label_match:
location_with_low_population = column_has_locations_with_low_populations(dataset, column_name, country)
if(location_with_low_population):
#Log result and save column as possible pii. Theres different log depending if match was with column or label
if(column_name_match):
log_and_print("Column '"+column_name+"' considered possible pii given column name had a "+type_of_matching+" match with restricted word '"+ restricted_word+"' and has a location with population under 20,000: "+location_with_low_population)
possible_pii[column_name] = "Name had "+ type_of_matching + " match with restricted word '"+restricted_word+"' and has a location with population under 20,000: "+location_with_low_population
elif(column_label_match):
log_and_print("Column '"+column_name+ "' considered possible pii given column label '"+column_label+"' had a "+type_of_matching+" match with restricted word '"+ restricted_word+"' and has a location with population under 20,000: "+location_with_low_population)
possible_pii[column_name] = "Label had "+ type_of_matching + " match with restricted word '"+restricted_word+"' and has a location with population under 20,000: "+location_with_low_population
#If found, I dont need to keep checking this column with other restricted words
break
return possible_pii
def find_piis_based_on_sparse_entries(dataset, label_dict, columns_to_check, sparse_values_threshold=0.3):
#Identifies pii based on columns having sparse values
possible_pii={}
for column_name in columns_to_check:
if column_is_sparse(dataset, column_name, sparse_threshold=sparse_values_threshold):
log_and_print("Column '"+column_name+"' considered possible pii given entries are sparse")
possible_pii[column_name] = "Column entries are too sparse"
return possible_pii
def find_columns_with_specific_format(dataset, format_to_search, columns_to_check):
columns_with_phone_numbers = {}
if format_to_search == PHONE_NUMBER:
regex_expression = ".*(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4}).*"
elif format_to_search == DATE:
#dd/mm/yy, (with -, / or .)
regex_date_1 = "((0[1-9]|[12]\d|3[01])(\/|-|\.)(0[1-9]|1[0-2])(\/|-|\.)[12]\d{3})"
#mm/dd/yyy, (with -, / or .)
regex_date_2 = "((0[1-9]|1[0-2])(\/|-|\.)(0[1-9]|[12]\d|3[01])(\/|-|\.)[12]\d{3})"
#yyyy/mm/dd, (with -, / or .)
regex_date_3 = "([12]\d{3}(\/|-|\.)(0[1-9]|1[0-2])(\/|-|\.)(0[1-9]|[12]\d|3[01]))"
regex_expression = regex_date_1+'|'+regex_date_2+'|'+regex_date_3
for column in columns_to_check:
#Check that all values in column are not NaN
if(pd.isnull(dataset[column]).all() == False):
#Find first 10 values that are not NaN nor empty space ''
column_with_no_nan = dataset[column].dropna()
column_with_no_empty_valyes = column_with_no_nan[column_with_no_nan != '']
first_10_values = column_with_no_empty_valyes.iloc[0:10]
match_result = first_10_values.astype(str).str.match(pat = regex_expression)
#If all not NaN values matched with regex, save column as PII candidate
if(any(match_result)):
log_and_print("Column '"+column+"' considered possible pii given column entries have "+format_to_search+" format")
columns_with_phone_numbers[column]= "Column entries have "+format_to_search+" format"
return columns_with_phone_numbers
def export_encoding(dataset_path, encoding_dict):
dataset_complete_file_name = ntpath.basename(dataset_path)
dataset_file_name_no_extension, dataset_type = os.path.splitext(dataset_complete_file_name)
encoding_file_path = os.path.join(OUTPUTS_FOLDER, dataset_file_name_no_extension + '_encodingmap.csv')
def save_all_piis_in_txt_file(list_variables_to_drop, list_variables_to_encode):
all_piis_txt_file = os.path.join(OUTPUTS_FOLDER,'all_piis_identified.txt')
delete_if_exists(all_piis_txt_file)
file = open(all_piis_txt_file, "a")
if len(list_variables_to_drop)>0:
file.write(f'Columns to drop: {" ".join(list_variables_to_drop)}\n')
if len(list_variables_to_encode)>0:
file.write(f'Columns to encode: {" ".join(list_variables_to_encode)}')
file.close()
def create_deidentifying_do_file(dataset_path, pii_candidates_to_action):
'''
Using anonymize_script_tempalte.txt as a starting point, we create a .do file that deidentifies dataset according to pii_candidates_to_action
'''
#Make a copy of the template file
template_file = 'anonymize_script_template_v2.do'
script_filename= os.path.join(OUTPUTS_FOLDER, 'anonymize_script.do')
delete_if_exists(script_filename)
shutil.copyfile(template_file, script_filename)
deidentified_dataset_path = dataset_path.split('.')[0] + '_deidentified.dta'
#Create list of vars to drop and encode
list_variables_to_drop = []
list_variables_to_encode = []
for pii_candidate, action in pii_candidates_to_action.items():
if action == 'Drop':
list_variables_to_drop.append(pii_candidate)
elif action == 'Encode':
list_variables_to_encode.append(pii_candidate)
#Read all lines and replace whenever we find one of the keywords
with fileinput.FileInput(script_filename, inplace=True) as file: #, backup='.bak'
today_string = date.today().strftime("%m/%d/%y")
for line in file:
#Create modified_line
modified_line = line
modified_line = modified_line.replace('[date]', today_string)
modified_line = modified_line.replace('[input_file_path]', dataset_path)
modified_line = modified_line.replace('[output_file_path]', deidentified_dataset_path)
modified_line = modified_line.replace('[list_variables_to_drop_space_delimited]', " ".join(list_variables_to_drop))
modified_line = modified_line.replace('[list_variables_to_hash_space_delimited]', " ".join(list_variables_to_encode))
#The template .do file has an option to only remove value labels, we are not using that option so we will by default select no variables for that.
modified_line = modified_line.replace('[list_variables_to_remove_value_labelling_space_delimited]', "")
#Save modified line in file
#print here will print in the file, not actually printing in console
print(modified_line, end='')
#Write down list of variables in a document
save_all_piis_in_txt_file(list_variables_to_drop, list_variables_to_encode)
def delete_if_exists(file_path):
if os.path.exists(file_path):
os.remove(file_path)
def export_encoding(dataset_path, encoding_dict):
encoding_file_path = dataset_path.split('.')[0] + '_encodingmap.csv'
#Delete if file exists
delete_if_exists(encoding_file_path)
encoding_df = pd.DataFrame(columns=['variable','orginial value', 'encoded value'])
for variable, values_dict in encoding_dict.items():
for original_value, encoded_value in values_dict.items():
encoding_df.loc[-1] = [variable, original_value, encoded_value]
encoding_df.index = encoding_df.index + 1
encoding_df.to_csv(encoding_file_path, index=False)
def create_anonymized_dataset(dataset, label_dict, dataset_path, pii_candidate_to_action, columns_where_to_replace_piis = None, piis_found_in_ustructured_text = None):
#Drop columns
columns_to_drop = [column for column in pii_candidate_to_action if pii_candidate_to_action[column]=='Drop']
dataset = dataset.drop(columns=columns_to_drop)
log_and_print("Dropped columns: "+ " ".join(columns_to_drop))
#Encode columns
columns_to_encode = [column for column in pii_candidate_to_action if pii_candidate_to_action[column]=='Encode']
if(len(columns_to_encode)>0):
log_and_print("Hashed columns: "+ " ".join(columns_to_encode))
dataset, encoding_used = recode(dataset, columns_to_encode)
log_and_print("Map file for encoded values created.")
export_encoding(dataset_path, encoding_used)
#Replace piis in unstructured text
if(columns_where_to_replace_piis and piis_found_in_ustructured_text):
for c in columns_where_to_replace_piis:
dataset[c].replace(piis_found_in_ustructured_text, 'XXXX', regex=True, inplace=True)
exported_file_path = export(dataset, dataset_path, label_dict)
return exported_file_path
def find_survey_cto_vars(dataset):
surveycto_vars = restricted_words_list.get_surveycto_vars()
possible_pii = {}
#For every column name in our dataset
for column_name in dataset.columns:
#For every restricted word
for restricted_word in surveycto_vars:
#Check if restricted word is in the column name
if word_match(column_name, restricted_word):
possible_pii[column_name] = 'SurveyCTO variable'
return possible_pii
def find_piis_based_on_column_format(dataset, label_dict, columns_to_check):
all_piis_detected = {}
#Find columns with phone numbers formats
columns_with_phone_numbers = find_columns_with_specific_format(dataset, PHONE_NUMBER, columns_to_check)
all_piis_detected.update(columns_with_phone_numbers)
columns_with_dates = find_columns_with_specific_format(dataset, DATE, columns_to_check)
all_piis_detected.update(columns_with_dates)
return all_piis_detected
def create_outputs_folder(dataset_path):
directory_path = os.path.dirname(dataset_path)
global OUTPUTS_FOLDER
OUTPUTS_FOLDER = directory_path+'/pii_detection_outputs'
if os.path.exists(OUTPUTS_FOLDER):
shutil.rmtree(OUTPUTS_FOLDER)
os.mkdir(OUTPUTS_FOLDER)
def create_log_file_path(dataset_path):
global LOG_FILE_PATH
LOG_FILE_PATH = OUTPUTS_FOLDER+"/log.txt"
delete_if_exists(LOG_FILE_PATH)
def import_file(dataset_path):
#Create outputs folder and log file
create_outputs_folder(dataset_path)
#Create log file
create_log_file_path(dataset_path)
#Read file
import_status, import_result = import_dataset(dataset_path)
#Check if error ocurr
if import_status is False:
return import_status, import_result
#If no error, decouple import result
dataset, dataset_path, label_dict, value_label_dict = import_result
#Save results in dictionary for return
response_content = {}
response_content[DATASET] = dataset
response_content[LABEL_DICT] = label_dict
response_content[VALUE_LABEL_DICT] = value_label_dict
return True, response_content
def recode(dataset, columns_to_encode):
#Keep record of encoding
econding_used = {}
for var in columns_to_encode:
#For hashing, we will use hmac-sha1, then sort the hashed values and assign values 1-n.
# Make dictionary of old and new values.
#First there is a step between
unique_val_to_hmacsha1 = {}
hmacsha1_to_final_hash = {}
for unique_val in dataset[var].dropna().unique():
unique_val_to_hmacsha1[unique_val] = hash_generator.hmac_sha1('[SECRET KEY]', unique_val)
#Get list of all hmac-sha1 hashes and sort them
sorted_hash = [v for k, v in sorted(unique_val_to_hmacsha1.items(), key=lambda item: item[1])]
#Create dict that points from hmac-sha1 hashes to a 1-n value
hmacsha1_to_final_hash = {}
for index, hash in enumerate(sorted_hash):
hmacsha1_to_final_hash[hash]=index+1
#Join two dictionaries
unique_val_to_final_hash = {}
for k, v in unique_val_to_hmacsha1.items():
unique_val_to_final_hash[k] = hmacsha1_to_final_hash[v]
#Replace column with its hashes. First create list of all hashed values
hashed_column = []
for value in dataset[var].tolist():
if value is np.nan:
hashed_column.append(np.nan)
else:
hashed_column.append(unique_val_to_final_hash[value])
dataset[var] = hashed_column
print(var + ' has been successfully encoded.')
econding_used[var] = unique_val_to_final_hash
return dataset, econding_used
def find_piis_unstructured_text(dataset, label_dict, columns_still_to_check, language, country):
#Filter columns to those that have sparse entries
columns_to_check = []
for column_name in columns_still_to_check:
if column_has_sufficiently_sparse_strings(dataset, column_name):
columns_to_check.append(column_name)
pii_candidates_unstructured_text = unstructured_text.find_piis(dataset, label_dict, columns_to_check, language, country)
log_and_print(f'Piis found in columns {columns_to_check} with unstructured text: {pii_candidates_unstructured_text}')
return pii_candidates_unstructured_text, columns_to_check
def input_file_is_dta(dataset_path):
dataset_file_name_no_extension, dataset_type = os.path.splitext(dataset_path)
if dataset_type == '.dta':
return True
else:
return False
def export(dataset, dataset_path, variable_labels = None):
dataset_complete_file_name = ntpath.basename(dataset_path)
dataset_file_name_no_extension, dataset_type = os.path.splitext(dataset_complete_file_name)
if(dataset_type == '.csv'):
new_file_path = os.path.join(OUTPUTS_FOLDER, dataset_file_name_no_extension + '_deidentified.csv')
delete_if_exists(new_file_path)
dataset.to_csv(new_file_path, index=False)
elif(dataset_type == '.dta'):
new_file_path = os.path.join(OUTPUTS_FOLDER, dataset_file_name_no_extension + '_deidentified.dta')
delete_if_exists(new_file_path)
try:
dataset.to_stata(new_file_path, variable_labels = variable_labels, write_index=False)
except:
dataset.to_stata(new_file_path, version = 118, variable_labels = variable_labels, write_index=False)
elif(dataset_type == '.xlsx'):
new_file_path = os.path.join(OUTPUTS_FOLDER, dataset_file_name_no_extension + '_deidentified.xlsx')
delete_if_exists(new_file_path)
dataset.to_excel(new_file_path, index=False)
elif(dataset_type == '.xls'):
new_file_path = os.path.join(OUTPUTS_FOLDER, dataset_file_name_no_extension + '_deidentified.xls')
delete_if_exists(new_file_path)
dataset.to_excel(new_file_path, index=False)
else:
log_and_print("Data type not supported")
new_file_path = None
return new_file_path
def internet_on():
try:
urllib2.urlopen('http://google.com', timeout=2)
return True
except Exception as e:
log_and_print(e)
return False
def get_directories_path_in_folder(folder_path):
only_directories = [join(folder_path, f) for f in listdir(folder_path) if isdir(join(folder_path, f))]
return only_directories
def get_files_path_in_folder(folder_path):
only_files = [join(folder_path, f) for f in listdir(folder_path) if isfile(join(folder_path, f))]
return only_files
def get_testing_tuple(folder_path):
only_files = get_files_path_in_folder(folder_path)
data_source = None
excel_with_ground_truth_pii = None
country_file = None
for file in only_files:
if file.split('.')[-1]=='dta':
data_source = file
continue
if file.split('-')[-1]=='true_piis.xlsx':
excel_with_ground_truth_pii = file
continue
if file.split('.')[-1]=='txt':
country_file = file
continue
if data_source and excel_with_ground_truth_pii and country_file:
return True, (data_source, excel_with_ground_truth_pii, country_file)
else:
return False, False
def get_test_files_tuples():
all_test_files_tuples = []
#Look for files in X:\Box Sync\GRDS_Resources\Data Science\Test data\Raw\
#For every folder inside, if folder has .dta and .xlsx ending with -piis.xlsx, add it to list
folder_with_raw_data = 'X:\Box Sync\GRDS_Resources\Data Science\Test data\Raw'
only_directories = get_directories_path_in_folder(folder_with_raw_data)
for dir in only_directories:
#Check that dir has .dta and .xls
dir_has_testing_tuple, testing_tuple = get_testing_tuple(dir)
if dir_has_testing_tuple:
all_test_files_tuples.append((testing_tuple[0], testing_tuple[1], testing_tuple[2]))
return all_test_files_tuples
def get_country(country_file_path):
with open(country_file_path) as f:
lines = f.readlines()
return lines[0]
def run_tests():
test_files_tuples = get_test_files_tuples()
for test_files_tuple in test_files_tuples:
dataset_path, true_piis_path, country_file_path = test_files_tuple
country = get_country(country_file_path)
print(f'RUNNING TEST FOR {dataset_path}.\nCountry {country}')
#Import dataset
reading_status, reading_content = import_file(dataset_path)
#Check if reading was succesful
if(reading_status is False):
return
dataset = reading_content[DATASET]
label_dict = reading_content[LABEL_DICT]
value_label_dict = reading_content[VALUE_LABEL_DICT]
columns_still_to_check = [c for c in dataset.columns if c not in restricted_words_list.get_surveycto_restricted_vars()]
#Search piis using all methods
all_piis_found = {}
#Options
consider_locations_cols = 1
search_pii_in_unstructured_text = 0
pii_candidates = find_piis_based_on_column_name(dataset, label_dict, value_label_dict, columns_still_to_check, consider_locations_cols)
all_piis_found.update(pii_candidates)
columns_still_to_check = [c for c in columns_still_to_check if c not in pii_candidates]
log_and_print("Piis found using column names: "+",".join(pii_candidates.keys()))
if(consider_locations_cols==0):
pii_candidates = find_piis_based_on_locations_population(dataset, label_dict, columns_still_to_check, country)
all_piis_found.update(pii_candidates)
columns_still_to_check = [c for c in columns_still_to_check if c not in pii_candidates]
log_and_print("Piis found basen on locations with low population: "+",".join(pii_candidates.keys()))
pii_candidates = find_piis_based_on_column_format(dataset, label_dict, columns_still_to_check)
all_piis_found.update(pii_candidates)
columns_still_to_check = [c for c in columns_still_to_check if c not in pii_candidates]
log_and_print("Piis found using column formats: "+",".join(pii_candidates.keys()))
if search_pii_in_unstructured_text == 0:
pii_candidates_unstructured_text = None
column_with_unstructured_text = None
pii_candidates = find_piis_based_on_sparse_entries(dataset, label_dict, columns_still_to_check)
all_piis_found.update(pii_candidates)
log_and_print("Piis based on sparse entries: "+",".join(pii_candidates.keys()))
else:
pii_candidates_unstructured_text, column_with_unstructured_text = find_piis_unstructured_text(dataset, label_dict, columns_still_to_check, SPANISH, MEXICO)
log_and_print("Piis found in unstructured text: "+",".join(pii_candidates_unstructured_text))
log_and_print(len(pii_candidates_unstructured_text))
#Create fake pii_candidate_to_action
pii_candidate_to_action = {}
for pii in pii_candidates:
pii_candidate_to_action[pii] = 'Drop'
#Create deidentified dataset
create_anonymized_dataset(dataset, label_dict, dataset_path, pii_candidate_to_action, pii_candidates_unstructured_text, column_with_unstructured_text)
#Now we check identified PIIs are the correct ones based on ground truth
reading_status, reading_content = import_file(true_piis_path)
if(reading_status is False):
return
true_piis_dataset = reading_content[DATASET]
true_piis = true_piis_dataset.iloc[:,0].to_list()
#Announce wrongly detected ppis
print("THE FOLLOWING PIIS WERE WRONGLY DETECTED:")
wrongly_detected = [pii for pii in all_piis_found.keys() if pii not in true_piis]
print(wrongly_detected)
#Announce missing piis
print("THE FOLLOWING PIIS WERE NOT DETECTED:")
not_detected = [pii for pii in true_piis if pii not in all_piis_found.keys()]
print(not_detected)
if __name__ == "__main__":
run_tests()