Merge pull request #248 from Emory-HITI/dev
Pandas edits to the suvpar module
pradeeban authored Dec 10, 2021
2 parents 81bf6c0 + 1e1e409 commit fabf7b9
Showing 9 changed files with 190 additions and 30 deletions.
8 changes: 7 additions & 1 deletion README.md
@@ -1,6 +1,12 @@
# Niffler: A DICOM Framework for Machine Learning and Processing Pipelines.

Niffler is an efficient DICOM Framework for machine learning pipelines and processing workflows on metadata. It facilitates efficient transfer of DICOM images on-demand and real-time from PACS to the research environments, to run processing workflows and machine learning pipelines.
Niffler is a lightweight framework to facilitate executing machine learning pipelines and processing workflows on DICOM images and metadata. Niffler facilitates efficient on-demand and real-time transfer of DICOM images from the PACS to the research environments. Niffler is also integrated with the radiology information system (RIS) to retrieve clinical data in real-time. The DICOM images from the PACS and the clinical data retrieved from the RIS can be used in conjunction, both in real-time and retrospectively on-demand.

The Niffler framework consists of:
- On-demand and real-time retrieval and processing of DICOM images from the PACS environment configured to accept requests from a deployment of Niffler.
- Acquisition and processing of clinical data from a RIS, to enable real-time analytics (RTA).
- Supportive utility functions such as DICOM → PNG conversion, DICOM → NIfTI conversion, DICOM anonymization, and a workflow module.
- Sample applications of the Niffler modules.

Niffler enables receiving DICOM images in real-time as a data stream from the PACS, as well as receiving specific DICOM data based on a series of DICOM C-MOVE queries. The Niffler real-time DICOM receiver extracts the metadata free of PHI as the images arrive, stores the metadata in a MongoDB database, and deletes the images nightly. The on-demand extractor reads a CSV file provided by the user (consisting of a list of values for PatientID, AccessionNumber, or other DICOM keywords) and performs a series of DICOM C-MOVE requests to receive them from the PACS, without manually querying them. Niffler also provides additional features such as converting DICOM images into PNG images, and performing additional computations such as computing scanner utilization and finding scanners with misconfigured clocks.
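As a purely illustrative sketch, an on-demand extraction CSV may be as simple as the following (hypothetical values; the header row must use the DICOM keyword, such as PatientID or AccessionNumber, that your query is based on):

```
AccessionNumber
1234567890
2345678901
```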

2 changes: 1 addition & 1 deletion modules/app-layer/pom.xml
@@ -47,7 +47,7 @@
      <dependency>
        <groupId>org.apache.logging.log4j</groupId>
        <artifactId>log4j-core</artifactId>
        <version>2.13.2</version>
        <version>2.15.0</version>
      </dependency>
    </dependencies>

3 changes: 3 additions & 0 deletions modules/dicom-anonymization/DicomAnonymizer.py
@@ -10,6 +10,8 @@
import sys
import pydicom
import random
import glob
import pathlib
import pickle


@@ -33,6 +35,7 @@ def get_dcm_folders(dcm_root_dir):
    return dcm_flds



# randomly anonymizes the input id
def randomizeID(id):
    string = str(id)
128 changes: 128 additions & 0 deletions modules/dicom-anonymization/DicomAnonymizer2.py
@@ -0,0 +1,128 @@
import os
import sys
import pydicom
import random
import glob
import pathlib
import pickle
import string

def get_dcm_paths(dcm_root_dir):
    paths = glob.glob(os.path.join(dcm_root_dir, "**/*.dcm"), recursive=True)
    return paths

def randomizeID(id):
    id_str = str(id)  # local name chosen so the string module above is not shadowed
    splits = id_str.split('.')
    newID = splits[0]
    i = 0
    for split in splits:
        if i == 0:
            i += 1
            continue
        elif len(split) == 1:
            newID = '.'.join((newID, split))
            continue
        num = int(split) + random.randint(0, 9)
        newID = '.'.join((newID, str(num)))

    return newID

def anonSample(file, idtype, dict):
    id = file[idtype].value
    if id in dict.keys():
        anon_id = dict[id]
    else:
        if idtype == 'PatientID':
            anon_id = ''.join(random.SystemRandom().choice(string.ascii_uppercase + string.digits) for _ in range(25))
        else:
            anon_id = randomizeID(id)
        # make sure that the new ID isn't the same as another
        while anon_id in dict.values():
            anon_id = randomizeID(id)
        dict[id] = anon_id

    return anon_id


def dcm_anonymize(dcm_files, output_path, stop=None):
    # creates dictionaries for the IDs for look up later
    samplePatientIDs = {}
    sampleStudyInstanceUIDs = {}
    sampleSeriesInstanceUID = {}
    sampleSOPInstanceUID = {}

    UIDs = {'PatientID': samplePatientIDs,
            'StudyInstanceUID': sampleStudyInstanceUIDs,
            'SeriesInstanceUID': sampleSeriesInstanceUID,
            'SOPInstanceUID': sampleSOPInstanceUID}

    # UIDs = pickle.load(open(os.path.join(output_path, "UIDs.pkl"), "rb"))

    skipped = []

    # tags to anonymize
    anon_tags = ['InstanceCreationDate', 'InstanceCreationTime', 'AccessionNumber', 'StudyDate',
                 'SeriesDate', 'AcquisitionDate', 'ContentDate', 'StudyTime', 'SeriesTime', 'AcquisitionTime',
                 'ContentTime', 'InstitutionName', 'InstitutionAddress', 'ReferringPhysicianName',
                 'PhysiciansOfRecord', 'PerformingPhysicianName', 'OperatorsName', 'PatientName',
                 'IssuerOfPatientID', 'PatientBirthDate', 'PatientSex', 'OtherPatientIDs', 'PatientAge', 'PatientSize',
                 'PatientWeight', 'PatientAddress', 'EthnicGroup', 'PregnancyStatus', 'RequestingPhysician',
                 'PerformedProcedureStepStartDate', 'PerformedProcedureStepStartTime', 'PerformedProcedureStepID']

    # anonymize the files, stopping early once `stop` files have been processed
    n = 0
    for file in dcm_files:
        try:  # skip any file that cannot be read as a valid DICOM file
            dcm_file = pydicom.dcmread(file)
            dcm_file.remove_private_tags()
            out_path = output_path
            for UID in UIDs.keys():
                # get the UID and get the anonymized UID
                anon_id = anonSample(dcm_file, UID, UIDs[UID])
                dcm_file[UID].value = anon_id
                out_path = os.path.join(out_path, anon_id)

            out_path += ".dcm"
            # save instance UID to rename the filename (so that filename and SOPInstanceUID match)
            # for the other tags, make them anonymous
            for tag in anon_tags:
                if tag in dcm_file:
                    if type(dcm_file.data_element(tag).value) == str:
                        dcm_file.data_element(tag).value = 'N/A'
                    elif type(dcm_file.data_element(tag).value) == pydicom.uid.UID:
                        dcm_file.data_element(tag).value = 'N/A'
                    elif type(dcm_file.data_element(tag).value) == int:
                        dcm_file.data_element(tag).value = 0
                    else:
                        dcm_file.data_element(tag).value = 0.0

            pathlib.Path("/".join(out_path.split("/")[:-1])).mkdir(parents=True, exist_ok=True)
            dcm_file.save_as(out_path)
            n += 1
        except Exception:
            print('Invalid Dicom Error, skipping')
            skip_file = pydicom.dcmread(file, force=True)
            skipped.append((skip_file.AccessionNumber, skip_file.StudyInstanceUID))
            continue
        if n == stop or n == len(dcm_files):
            pickle.dump(UIDs, open(os.path.join(output_path, "UIDs.pkl"), "wb"))
            sys.exit()

    pickle.dump(UIDs, open(os.path.join(output_path, "UIDs.pkl"), "wb"))
    pickle.dump(skipped, open(os.path.join(output_path, "skipped.pkl"), "wb"))


if __name__ == "__main__":
    data_dir = sys.argv[1]
    output_dir = sys.argv[2]
    if len(sys.argv) > 3:
        # stopping number
        stop = int(sys.argv[3])
    else:
        stop = None
    print('Extracting DICOM folders', flush=True)
    dcm_folders = get_dcm_paths(data_dir)
    print('Starting DICOM Study Anonymization', flush=True)
    dcm_anonymize(dcm_folders, output_dir, stop=stop)
4 changes: 4 additions & 0 deletions modules/dicom-anonymization/README.md
@@ -4,3 +4,7 @@ You may convert a DICOM file into an anonymized DICOM file by running
```
python DicomAnonymizer.py <INPUT-DICOMS-FOLDER> <OUTPUT-DICOMS-FOLDER>
```
To maintain the source directory hierarchy, and to anonymize and map the PatientID field as well, you can run
```
python DicomAnonymizer2.py <INPUT-DICOMS-FOLDER> <OUTPUT-DICOMS-FOLDER>
```
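`DicomAnonymizer2.py` additionally accepts an optional third argument, a stopping number of files after which the anonymization halts. The original-to-anonymized ID mappings are pickled to `UIDs.pkl` in the output folder. As a minimal sketch of inspecting those mappings afterwards (the folder name `anonymized-dicoms` below is hypothetical):
```
import os
import pickle

output_dir = "anonymized-dicoms"  # hypothetical: the <OUTPUT-DICOMS-FOLDER> passed above

# DicomAnonymizer2.py pickles a dict that maps each tag name
# ('PatientID', 'StudyInstanceUID', ...) to {original ID: anonymized ID}.
with open(os.path.join(output_dir, "UIDs.pkl"), "rb") as f:
    uid_maps = pickle.load(f)

print(uid_maps["PatientID"])  # original-to-anonymized PatientID mapping
```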
3 changes: 2 additions & 1 deletion modules/nifti-extraction/ImageExtractorNifti.py
@@ -202,7 +202,8 @@ def extract_images(filedata, i, nifti_destination, flattened_to_level, failed, i
        # check for existence of the folder tree patient/study/series. Create if it does not exist.
        os.makedirs(nifti_destination + folderName, exist_ok=True)

        niftifile = nifti_destination + folderName + '/' + imName + '.nii.gz'

        niftifile = nifti_destination + folderName + '/' + ID1 + '_' + ID2 + '_' + ID3 + '.nii.gz'
        dicom2nifti.dicom_series_to_nifti(str(filedata.iloc[i].loc['file']), niftifile)
        filemapping = filedata.iloc[i].loc['file'] + ',' + niftifile + '\n'
    except AttributeError as error:
17 changes: 5 additions & 12 deletions modules/nifti-extraction/README.md
@@ -1,6 +1,6 @@
# The Niffler PNG Extractor
# The Niffler NIfTI Extractor

The PNG Extractor converts a set of DICOM images into png images, extract metadata in a privacy-preserving manner.
The NIfTI Extractor converts a set of DICOM images into NIfTI images and extracts metadata in a privacy-preserving manner.


## Configuring Niffler PNG Extractor
@@ -20,8 +20,6 @@ Find the config.json file in the folder and modify accordingly *for each* Niffler
* *FlattenedToLevel*: Specify how you want your folder tree to be. Default is, "patient" (produces patient/*.png).
You may change this value to "study" (patient/study/*.png) or "series" (patient/study/series/*.png). All IDs are de-identified.

* *is16Bit*: Specifies whether to save extracted image as 16-bit image. By default, this is set to true. Please set it to false to run 8-bit extraction.

* *SendEmail*: Do you want to send an email notification when the extraction completes? The default is true. You may disable this if you do not want to receive an email upon the completion.

* *YourEmail*: Replace "[email protected]" with a valid email if you would like to receive an email notification. If the SendEmail property is disabled, you can leave this as is.
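For illustration, a minimal config.json sketch covering the fields described above could look as follows (hypothetical values only; the actual file shipped with the module may define additional fields, such as the DICOMHome and Depth settings that also appear as command-line arguments below):

```
{
  "FlattenedToLevel": "study",
  "SendEmail": true,
  "YourEmail": "[email protected]"
}
```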
@@ -42,10 +40,10 @@ The below two fields can be left unmodified for most executions. The default val
$ python3 ImageExtractor.py

# With Nohup
$ nohup python3 ImageExtractor.py > UNIQUE-OUTPUT-FILE-FOR-YOUR-EXTRACTION.out &
$ nohup python3 ImageExtractorNifti.py > UNIQUE-OUTPUT-FILE-FOR-YOUR-EXTRACTION.out &

# With Command Line Arguments
$ nohup python3 ImageExtractor.py --DICOMHome "/opt/data/new-study" --Depth 0 --PrintImages true --SendEmail true > UNIQUE-OUTPUT-FILE-FOR-YOUR-EXTRACTION.out &
$ nohup python3 ImageExtractorNifti.py --DICOMHome "/opt/data/new-study" --Depth 0 --PrintImages true --SendEmail true > UNIQUE-OUTPUT-FILE-FOR-YOUR-EXTRACTION.out &
```
Check that the extraction is running smoothly with no errors, by,

@@ -65,12 +63,7 @@ In the OutputDirectory, there will be several sub folders and directories.

* *extracted-images*: The folder that consists of extracted PNG images

* *failed-dicom*: The folder that consists of the DICOM images that failed to produce the PNG images upon the execution of the Niffler PNG Extractor. Failed DICOM images are stored in 4 sub-folders named 1, 2, 3, and 4, categorizing according to their failure reason.


## Running the Niffler PNG Extractor with Slurm

There is also an experimental PNG extractor implementation (ImageExtractorSlurm.py) that provides a distributed execution based on Slurm on a cluster.
* *failed-dicom*: The folder that consists of the DICOM images that failed to produce the NIfTI images upon the execution of the Niffler NIfTI Extractor. Failed DICOM images are stored in 4 sub-folders named 1, 2, 3, and 4, categorized according to their failure reason.


## Troubleshooting
32 changes: 20 additions & 12 deletions modules/png-extraction/ImageExtractor.py
@@ -145,7 +145,7 @@ def extract_headers(f_list_elem):
    c = False
    kv = get_tuples(plan)  # gets tuple for field,val pairs for this file. function defined above
    # dicom images should not have more than 300 dicom tags
    if len(kv)>500:
    if len(kv)>300:
        logging.debug(str(len(kv)) + " dicom tags produced by " + ff)
    kv.append(('file', f_list_elem[1]))  # adds my custom field with the original filepath
    kv.append(('has_pix_array',c))  # adds my custom field with if file has image
@@ -205,6 +205,7 @@ def extract_images(filedata, i, png_destination, flattened_to_level, failed, is16Bit):
        pngfile = png_destination+folderName + '/' + hashlib.sha224(imName.encode('utf-8')).hexdigest() + '.png'
        dicom_path = filedata.iloc[i].loc['file']
        image_path = png_destination+folderName+'/' + hashlib.sha224(imName.encode('utf-8')).hexdigest() + '.png'
        isRGB = filedata.iloc[i].loc['PhotometricInterpretation'] == 'RGB'
        if is16Bit:
            # write the PNG file as a 16-bit greyscale
            image_2d = ds.pixel_array.astype(np.double)
@@ -214,7 +215,10 @@
            shape = ds.pixel_array.shape
            image_2d_scaled = np.uint16(image_2d_scaled)
            with open(pngfile , 'wb') as png_file:
                w = png.Writer(shape[1], shape[0], greyscale=True,bitdepth=16)
                if isRGB:
                    w = png.Writer(shape[1], shape[0], greyscale=False,bitdepth=16)
                else:
                    w = png.Writer(shape[1], shape[0], greyscale=True,bitdepth=16)
                w.write(png_file, image_2d_scaled)
        else:
            shape = ds.pixel_array.shape
@@ -226,7 +230,10 @@
            image_2d_scaled = np.uint8(image_2d_scaled)
            # Write the PNG file
            with open(pngfile , 'wb') as png_file:
                w = png.Writer(shape[1], shape[0], greyscale=True)
                if isRGB:
                    w = png.Writer(shape[1], shape[0], greyscale=False)
                else:
                    w = png.Writer(shape[1], shape[0], greyscale=True)
                w.write(png_file, image_2d_scaled)
        filemapping = filedata.iloc[i].loc['file'] + ', ' + pngfile + '\n'
    except AttributeError as error:
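For context, a minimal standalone sketch of the pypng pattern used in the hunks above (made-up 2×2 pixel data; an RGB row interleaves the R, G, B values of each pixel, so only the greyscale flag changes between the two cases):

```
import png

# 2x2 greyscale image: one value per pixel in each row
grey_rows = [[0, 255], [255, 0]]
with open('grey.png', 'wb') as f:
    png.Writer(2, 2, greyscale=True).write(f, grey_rows)

# 2x2 RGB image: three interleaved values per pixel in each row
rgb_rows = [[255, 0, 0, 0, 255, 0], [0, 0, 255, 255, 255, 255]]
with open('rgb.png', 'wb') as f:
    png.Writer(2, 2, greyscale=False).write(f, rgb_rows)
```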
@@ -379,14 +386,15 @@ def execute(pickle_file, dicom_home, output_directory, print_images, print_only_
    total = len(chunk)
    stamp = time.time()
    for i in range(len(filedata)):
        (fmap,fail_path,err) = extract_images(filedata, i, png_destination, flattened_to_level, failed, is16Bit)
        if err:
            count +=1
            copyfile(fail_path[0],fail_path[1])
            err_msg = str(count) + ' out of ' + str(len(chunk)) + ' dicom images have failed extraction'
            logging.error(err_msg)
        else:
            fm.write(fmap)
        if (filedata.iloc[i].loc['file'] is not np.nan):
            (fmap,fail_path,err) = extract_images(filedata, i, png_destination, flattened_to_level, failed, is16Bit)
            if err:
                count +=1
                copyfile(fail_path[0],fail_path[1])
                err_msg = str(count) + ' out of ' + str(len(chunk)) + ' dicom images have failed extraction'
                logging.error(err_msg)
            else:
                fm.write(fmap)
    fm.close()
    logging.info('Chunk run time: %s %s', time.time() - t_start, ' seconds!')

@@ -432,7 +440,7 @@ def execute(pickle_file, dicom_home, output_directory, print_images, print_only_
    merged_meta = pd.concat(meta_list,ignore_index=True)
    merged_meta.to_csv('{}/metadata.csv'.format(output_directory),index=False)
    # getting a single mapping file
    logging.info('Generatign final mapping file')
    logging.info('Generating final mapping file')
    mappings = glob.glob("{}/maps/*.csv".format(output_directory))
    map_list = list()
    for mapping in mappings:
23 changes: 20 additions & 3 deletions modules/suvpar/Strip.py
@@ -20,15 +20,32 @@ def initialize():
    feature_list = text_file.read().split('\n')

    df = pandas.read_csv(filename, usecols=lambda x: x in feature_list, sep=',')
    logging.info(df['ImageType'])


def strip():
    global df
    # Drop entries without an ImageType, AcquisitionTime, AcquisitionDate, AccessionNumber, or DeviceSerialNumber entry.
    df.dropna(subset=["ImageType"], inplace=True)
    df.dropna(subset=["AccessionNumber"], inplace=True)
    df.dropna(subset=["AcquisitionTime"], inplace=True)
    df.dropna(subset=["AcquisitionDate"], inplace=True)
    df.dropna(subset=["DeviceSerialNumber"], inplace=True)
    # Consider only the ImageType that are ORIGINAL.
    df = df[df['ImageType'].str.contains("ORIGINAL")]
    # Consider only MR. Remove modalities such as PR and SR that are present in the original data.
    df = df[df.Modality == "MR"]
    # Consider only the ImageType that are true.
    df = df[df['ImageType'].str.contains("ORIGINAL")]
    # Ignore milliseconds
    df['AcquisitionTime'] = df['AcquisitionDate'].astype(int).astype(str) + \
                            df['AcquisitionTime'].astype(int).astype(str)
    df['AcquisitionTime'] = pandas.to_datetime(df['AcquisitionTime'], format='%Y%m%d%H%M%S')
    df = df.join(
        df.groupby('AccessionNumber')['AcquisitionTime'].aggregate(['min', 'max']),
        on='AccessionNumber')
    df.rename(columns={'AcquisitionTime': 'AcquisitionDateTime'}, inplace=True)
    df.rename(columns={'min': 'MinAcquisitionDateTime'}, inplace=True)
    df.rename(columns={'max': 'MaxAcquisitionDateTime'}, inplace=True)
    df = df.drop_duplicates('AccessionNumber')
    df = df.drop(columns=['AcquisitionDate'])


def write():
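For clarity, a minimal, self-contained illustration of the pandas pattern that strip() uses above to attach per-accession minimum and maximum acquisition times (made-up data; only the groupby/aggregate/join step is shown):

```
import pandas

# Hypothetical input: two rows for accession A1, one row for A2.
df = pandas.DataFrame({
    'AccessionNumber': ['A1', 'A1', 'A2'],
    'AcquisitionTime': pandas.to_datetime(
        ['20211210090000', '20211210091500', '20211210100000'],
        format='%Y%m%d%H%M%S'),
})

# Join each row with the earliest and latest acquisition time of its accession.
df = df.join(
    df.groupby('AccessionNumber')['AcquisitionTime'].aggregate(['min', 'max']),
    on='AccessionNumber')
df.rename(columns={'min': 'MinAcquisitionDateTime',
                   'max': 'MaxAcquisitionDateTime'}, inplace=True)

# As in strip(), keep one row per accession.
print(df.drop_duplicates('AccessionNumber'))
```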
