Skip to content

Commit

Permalink
Merge pull request #122 from Emory-HITI/dev
Browse files Browse the repository at this point in the history
Merge dev to master
  • Loading branch information
pradeeban authored Apr 9, 2021
2 parents 8e1146b + 74707f1 commit 0c32a7d
Show file tree
Hide file tree
Showing 6 changed files with 51 additions and 46 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ Niffler enables receiving DICOM images real-time as a data stream from PACS as w

# Configure Niffler

Niffler consists of 4 modules, inside the modules folder. Here we will look into the common configuration and installation steps of Niffler. An introduction to Niffler can be found [here](https://emory-hiti.github.io/Niffler/).
Niffler consists of 5 modules, inside the modules folder. Here we will look into the common configuration and installation steps of Niffler. An introduction to Niffler can be found [here](https://emory-hiti.github.io/Niffler/).

## Configure PACS

Expand Down
4 changes: 4 additions & 0 deletions docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ Receives DICOM images as a stream from a PACS and extracts and stores the metada

Converts a set of DICOM images into png images, extract metadata in a privacy-preserving manner. The extracted metadata is stored in a CSV file, along with the de-identified PNG images. The mapping of PNG files and their respective metadata is stored in a separate CSV file.

## dicom-anonymization

Converts a set of DICOM images into anonymized DICOM images, stripping off the PHI.

## app-layer

The app-layer (application layer) consists of specific algorithms. The app-layer/src/main/scripts consists of Javascript scripts such as scanner clock calibration. The app-layer/src/main/java consists of the scanner utilization computation algorithms developed in Java.
Expand Down
33 changes: 19 additions & 14 deletions modules/cold-extraction/ColdDataRetriever.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,23 +131,28 @@ def initialize():
reader = csv.reader(f)
next(f)
for row in reader:
row = [x.strip() for x in row]
if (extraction_type == 'empi_date'):
patients.append(row[patient_index])
temp_date = row[date_index]
dt_stamp = datetime.datetime.strptime(temp_date, date_format)
date_str = dt_stamp.strftime('%Y%m%d')
dates.append(date_str)
length = len(patients)
if not ((row[patient_index] == "") or (row[date_index] == "")):
patients.append(row[patient_index])
temp_date = row[date_index]
dt_stamp = datetime.datetime.strptime(temp_date, date_format)
date_str = dt_stamp.strftime('%Y%m%d')
dates.append(date_str)
length = len(patients)
elif (extraction_type == 'empi'):
patients.append(row[patient_index])
length = len(patients)
if not ((row[patient_index] == "")):
patients.append(row[patient_index])
length = len(patients)
elif (extraction_type == 'accession'):
accessions.append(row[accession_index])
length = len(accessions)
if not ((row[accession_index] == "")):
accessions.append(row[accession_index])
length = len(accessions)
elif (extraction_type == 'empi_accession'):
patients.append(row[patient_index])
accessions.append(row[accession_index])
length = len(accessions)
if not ((row[patient_index] == "") or (row[accession_index] == "")):
patients.append(row[patient_index])
accessions.append(row[accession_index])
length = len(accessions)


# Run the retrieval only once, when the extraction script starts, and keep it running in a separate thread.
Expand Down Expand Up @@ -277,4 +282,4 @@ def run_threaded(job_func):
time.sleep(1)
except KeyboardInterrupt:
check_kill_process()
sys.exit(0)
sys.exit(0)
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
import sys
import pydicom
import random
import shutil
import pickle


Expand Down Expand Up @@ -102,20 +101,17 @@ def dcm_anonymize(dcm_folders, output_path, stop=None):
test_file = pydicom.dcmread(test_file_path)
anon_id = anonSample(test_file, 'StudyInstanceUID', UIDs['StudyInstanceUID'])
# make folder with the anonymized studyUID name
print(anon_id)
study_folder = os.path.join(output_path, anon_id)
os.mkdir(study_folder)
for file in files:
# copy the file to the new anon folder
shutil.copyfile(os.path.join(dcm_folder, file), os.path.join(study_folder, file))
dcm_file = pydicom.dcmread(os.path.join(study_folder, file))
dcm_file = pydicom.dcmread(os.path.join(dcm_folder, file))
dcm_file.remove_private_tags()
for UID in UIDs.keys():
# get the UID and get the anonymized UID
anon_id = anonSample(dcm_file, UID, UIDs[UID])
# save instance UID to rename the filename (so that filename and SOPinstance matches)
if UID == 'SOPInstanceUID':
new_filename = anon_id.copy()
new_filename = anon_id
dcm_file[UID].value = anon_id
# for the other tags, make them anonymous
for tag in anon_tags:
Expand All @@ -128,9 +124,9 @@ def dcm_anonymize(dcm_folders, output_path, stop=None):
dcm_file.data_element(tag).value = 0
else:
dcm_file.data_element(tag).value = 0.0
dcm_file.save_as(os.path.join(study_folder, new_filename))
dcm_file.save_as(os.path.join(study_folder, new_filename + '.dcm'))
n += 1
print('total files anonymized: {}/{}. Study: {}'.format(n, len(dcm_folders), study_folder), flush=True)
print('total folders anonymized: {}/{}. Study: {}'.format(n, len(dcm_folders), study_folder), flush=True)
except:
print('Invalid Dicom Error, skipping')
skip_file = pydicom.dcmread(test_file_path, force=True)
Expand All @@ -146,15 +142,14 @@ def dcm_anonymize(dcm_folders, output_path, stop=None):


if __name__ == "__main__":
# ex: 'python anon_pydicom.py /labs/banerjeelab/researchpacs_data/ /labs/banerjeelab/HCC_anon_dcm/200_noForce/'
data_dir = sys.argv[1]
output_dir = sys.argv[2]
if len(sys.argv) > 2:
if len(sys.argv) > 3:
# stopping number
stop = int(sys.argv[3])
else:
stop = None
print('Extracting DICOM folders', flush=True)
dcm_folders = get_dcm_folders(data_dir)
print('Starting DICOM Study Anonymization', flush=True)
dcm_anonymize(dcm_folders, output_dir, stop=stop)
dcm_anonymize(dcm_folders, output_dir, stop=None)
6 changes: 6 additions & 0 deletions modules/dicom-anonymization/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# The Niffler DICOM Anonymizer

You may convert a DICOM file into an anonymized DICOM file by running
```
python DicomAnonymizer.py <INPUT-DICOMS-FOLDER> <OUTPUT-DICOMS-FOLDER>
```
35 changes: 15 additions & 20 deletions modules/png-extraction/ImageExtractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import numpy as np
import pandas as pd
import pydicom as dicom
import png
#pydicom imports needed to handle data errors
from pydicom import config
from pydicom import datadict
Expand Down Expand Up @@ -152,8 +153,7 @@ def extract_images(i):
ID=filedata.iloc[i].loc['PatientID'] # Unique identifier for the Patient.
folderName = hashlib.sha224(ID.encode('utf-8')).hexdigest()
#check for existence of patient folder. Create if it does not exist.
if not (os.path.exists(png_destination + folderName)): # it is completely possible for multiple proceses to run this check at same time.
os.mkdir(png_destination + folderName)
os.makedirs(png_destination + folderName,exist_ok=True)
elif flattened_to_level == 'study':
ID1=filedata.iloc[i].loc['PatientID'] # Unique identifier for the Patient.
try:
Expand All @@ -163,8 +163,7 @@ def extract_images(i):
folderName = hashlib.sha224(ID1.encode('utf-8')).hexdigest() + "/" + \
hashlib.sha224(ID2.encode('utf-8')).hexdigest()
#check for existence of the folder tree patient/study/series. Create if it does not exist.
if not (os.path.exists(png_destination + folderName)): # it is completely possible for multiple proceses to run this check at same time.
os.makedirs(png_destination + folderName)
os.makedirs(png_destination + folderName,exist_ok=True)
else:
ID1=filedata.iloc[i].loc['PatientID'] # Unique identifier for the Patient.
try:
Expand All @@ -176,8 +175,7 @@ def extract_images(i):
folderName = hashlib.sha224(ID1.encode('utf-8')).hexdigest() + "/" + \
hashlib.sha224(ID2.encode('utf-8')).hexdigest() + "/" + hashlib.sha224(ID3.encode('utf-8')).hexdigest()
#check for existence of the folder tree patient/study/series. Create if it does not exist.
if not (os.path.exists(png_destination + folderName)): # it is completely possible for multiple proceses to run this check at same time.
os.makedirs(png_destination + folderName)
os.makedirs(png_destination + folderName,exist_ok=True)


pngfile = png_destination+folderName+'/' + hashlib.sha224(imName.encode('utf-8')).hexdigest() + '.png'
Expand Down Expand Up @@ -296,7 +294,6 @@ def fix_mismatch(with_VRs=['PN', 'DS', 'IS']):
pickle.dump(filelist,open(pickle_file,'wb'))
file_chunks = np.array_split(filelist,no_splits)
logging.info('Number of dicom files: ' + str(len(filelist)))
logging.info('Number of chunks is 100 with size ' + str(len(file_chunks[0])) )

try:
ff = filelist[0] #load first file as a template to look at all
Expand Down Expand Up @@ -345,19 +342,17 @@ def fix_mismatch(with_VRs=['PN', 'DS', 'IS']):
filedata=data
total = len(chunk)
stamp = time.time()
p = Pool(core_count)
res = p.imap_unordered(extract_images,range(len(filedata)))
for out in res:
(fmap,fail_path,err) = out
if err:
count +=1
copyfile(fail_path[0],fail_path[1])
err_msg = str(count) + ' out of ' + str(len(chunk)) + ' dicom images have failed extraction'
logging.error(err_msg)
else:
fm.write(fmap)
p.join()
p.close()
with Pool(core_count) as p:
res = p.imap_unordered(extract_images,range(len(filedata)))
for out in res:
(fmap,fail_path,err) = out
if err:
count +=1
copyfile(fail_path[0],fail_path[1])
err_msg = str(count) + ' out of ' + str(len(chunk)) + ' dicom images have failed extraction'
logging.error(err_msg)
else:
fm.write(fmap)
fm.close()
logging.info('Chunk run time: %s %s', time.time() - t_start, ' seconds!')

Expand Down

0 comments on commit 0c32a7d

Please sign in to comment.