Merge pull request #107 from Emory-HITI/dev

Ability to use private tags in real-time DICOM extractor.
Emory-HITI · Mar 2, 2021 · 9c780a5 · 9c780a5
2 parents e31323e + 27ef053
commit 9c780a5
Show file tree

Hide file tree

Showing 8 changed files with 213 additions and 30 deletions.
diff --git a/modules/meta-extraction/MetadataExtractor.py b/modules/meta-extraction/MetadataExtractor.py
@@ -28,7 +28,7 @@
 import json
 
 
-with open('system.json', 'r') as f:
+with open('service/system.json', 'r') as f:
     niffler = json.load(f)
 
 # Get constants from system.json
@@ -206,7 +206,8 @@ def extract_metadata():
                 logging.debug('The file %s is not found', series_path.decode("utf-8"))
             except IndexError:
                 logging.debug('Index error while attempting to access the Series %s', series_path.decode("utf-8"))
-            except:
+            except Exception as e:
+                logging.warn(e)
                 logging.warn('The script could not extract the series %s', series_path.decode("utf-8"))
         logging.info('Metadata Extraction Completed at: %s', str(datetime.datetime.now()))
 
@@ -282,7 +283,7 @@ def run_dcm4che():
         os.chdir(DCM4CHE_BIN)
         subprocess.call("{0}/storescp --accept-unknown --directory {1} --filepath {2} -b {3} > nohup.out &".format(DCM4CHE_BIN, STORAGE_FOLDER, FILE_PATH, QUERY_AET), shell=True)
 
-        logging.info('Stopped DCM4CHE successfully..')
+        logging.info('Started DCM4CHE successfully..')
 
 def run_threaded(job_func):
     job_thread = threading.Thread(target=job_func)

diff --git a/modules/meta-extraction/README.md b/modules/meta-extraction/README.md
@@ -5,14 +5,24 @@ The Real-time DICOM Extractor runs continuously to receive DICOM files, extract
 
 # Configuring Niffler Real-time DICOM Extractor
 
-Niffler real-time extraction must be configured as a service for it to run continuously, and resume even when the server restarts.
+Niffler real-time extraction must be configured as a service for it to run continuously and resume automatically even when the server restarts. Unless you are the administrator who is configuring Niffler for the first time, skip this section.
 
+Find the system.json file in the service folder and modify accordingly.
 
-## Configure DICOM attributes to extract
+system.json entries are to be set *only once* for the Niffler deployment by the administrator. Once set, further extractions do not require a change.
+
+* *DCM4CHEBin*: Set the location of the DCM4CHE bin folder (the folder that contains storescp).
+
+* *QueryAet*: Set the correct AET:PORT of the querying AET (i.e., this script). This is typically the same as the value you set for storescp.
 
-Skip this step if you are satisfied with the default attributes provided in the conf folder to extract.
+* *StorageFolder*: Create a folder where you would like your DICOM files to be stored. Usually, this is an empty folder (since each extraction is unique).
 
-The conf folder consists of several featureset.txt files. Each featureset has multiple attributes. Each featureset corresponds to a collection in the Metadata Store MongoDB database.
+* *FilePath*: By default, "{00100020}/{0020000D}/{0020000E}/{00080018}.dcm". This indicates a hierarchical storage of patients/studies/series/instances.dcm. Leave this value as it is unless you want to change the hierarchy.
+
+
+## Configure DICOM attributes to extract
+
+The conf folder consists of several featureset.txt files. Each featureset has multiple attributes. Each featureset corresponds to a collection in the Metadata Store MongoDB database. Skip this step if you are satisfied with the default attributes provided in the conf folder to extract.
 
 If you desire more DICOM attributes to an existing collection, add the attribute to an existing featureset.txt. Similarly, you may also remove existing attributes from the featureset files. 
 
@@ -21,28 +31,44 @@ If you prefer the additional attributes in a separate collection in the Mongo Me
 
 ## Configure mdextractor service
 
-### Offer execution permission to the mdextractor.sh script.
+The service folder contains mdextractor.sh, system.json, and mdextractor.service.
+
+mdextractor.sh appends its output to niffler-rt.out in the meta-extraction folder.
+
+Make sure to provide the correct full path of your meta-extraction folder in the 2nd line of mdextractor.sh, replacing the below:
+
+```
+cd /opt/localdrive/Niffler/modules/meta-extraction/
+```
+
+Offer execution permission to the mdextractor.sh script.
 
 $ chmod +x mdextractor.sh
 
 
-### Check permissions.
+Check permissions.
 
 $ ls -lrt mdextractor.sh
 
 -rwxrwxr-x. 1 pkathi2 pkathi2 332 Aug 15 14:10 mdextractor.sh
 
+Provide the appropriate values for mdextractor.service.
+
+```
+[Service]
+Environment="MONGO_URI=USERNAME:PASSWORD@localhost:27017/"
+Type=simple
+ExecStart=/opt/localdrive/Niffler/modules/meta-extraction/service/mdextractor.sh
+TimeoutStartSec=360
+StandardOutput=/opt/localdrive/Niffler/modules/meta-extraction/service.log
+StandardError=/opt/localdrive/Niffler/modules/meta-extraction/service-error.log
+```
 
 ### Move to systemd
 
 $ sudo cp mdextractor.service /etc/systemd/system/
 
 
-### Add the correct credentials to the mdextractor.service:
-
-Environment="MONGO_URI=USERNAME:PASSWORD@localhost:27017/"
-
-
 ### Reload the systemctl daemon.
 
 $ sudo systemctl daemon-reload
@@ -97,4 +123,4 @@ Check and make sure Mongo Service is running. If not, start it.
 
 Is the disk not full? Check whether the mdextractor service is running. If not, start it.
 
-Is the disk full, and consequently Niffler is unable to receive new images? Stop the mdextractor service if it is running. Empty the storage folder and remove the pickle file. Then, start the mdextractor service again.
+Is the disk full, and consequently Niffler is unable to receive new images? Stop the mdextractor service if it is running. Empty the storage folder and remove the pickle files. Then, start the mdextractor service again.
diff --git a/modules/meta-extraction/conf/featureset.txt b/modules/meta-extraction/conf/featureset.txt
@@ -8,20 +8,13 @@ SeriesTime
 SOPInstanceUID
 AcquisitionDate
 AcquisitionTime
-DurationOfXrayOn
 Exposure
 ExposureTime
 ImageType
 Manufacturer
 ManufacturerModelName
-MidScanFlag
-MidScanTime
 Modality
-ScanFOVType
 ScanOptions
-ScanPitchRatio
-SmartScanOnOffFlag
-StartScanToXrayOnDelay
 StationName
 StudyDescription
 InstitutionName

diff --git a/modules/meta-extraction/conf/featureset1.txt b/modules/meta-extraction/conf/featureset1.txt
@@ -15,7 +15,6 @@ SeriesNumber
 SeriesDescription
 ImageType
 SequenceName
-PulseSeqName
 MRAcquisitionType
 ScanOptions
 ReceiveCoilName
@@ -37,7 +36,6 @@ SpacingBetweenSlices
 NumberOfPhaseEncodingSteps
 EchoTrainLength
 PercentSampling
-PercentPhaseFieldOfView
 PixelBandwidth
 FlipAngle
 SAR
@@ -46,3 +44,6 @@ Manufacturer
 ManufacturerModelName
 0x0051100F
 0x0051100C
+0x00090010
+0x0019100B
+0x0019105A
diff --git a/modules/meta-extraction/pickles/README.md b/modules/meta-extraction/pickles/README.md
@@ -0,0 +1 @@
+The folder with the pickle files. These files trace the progress.
diff --git a/modules/meta-extraction/service/mdextractor.service b/modules/meta-extraction/service/mdextractor.service
@@ -6,10 +6,10 @@ After=network.target
 [Service]
 Environment="MONGO_URI=USERNAME:PASSWORD@localhost:27017/"
 Type=simple
-ExecStart=/opt/localdrive/researchpacs/src/meta-extraction/service/mdextractor.sh
+ExecStart=/opt/localdrive/Niffler/modules/meta-extraction/service/mdextractor.sh
 TimeoutStartSec=360
-StandardOutput=/opt/localdrive/researchpacs/service.log
-StandardError=/opt/localdrive/researchpacs/service-error.log
+StandardOutput=/opt/localdrive/Niffler/modules/meta-extraction/service.log
+StandardError=/opt/localdrive/Niffler/modules/meta-extraction/service-error.log
 
 [Install]
 WantedBy=default.target
diff --git a/modules/meta-extraction/service/mdextractor.sh b/modules/meta-extraction/service/mdextractor.sh
@@ -1,5 +1,6 @@
 #!/bin/bash
-nohup python3.6 -u /opt/localdrive/researchpacs/src/meta-extraction/MetadataExtractor.py >> /opt/localdrive/researchpacs/py.out &
+cd /opt/localdrive/Niffler/modules/meta-extraction/
+nohup python3.6 -u MetadataExtractor.py >> niffler-rt.out &
 wait
-echo "The Researchpacs Metadata Extractor Process has failed" >> /opt/localdrive/researchpacs/py.out
-echo "The Researchpacs Metadata Extractor Process has failed" | mail -s "The Researchpacs Metadata Extractor Process has failed" [email protected]
+echo "The Niffler Metadata Extractor Process has failed" >> niffler-rt.out
+echo "The Niffler Metadata Extractor Process has failed" | mail -s "The Niffler Metadata Extractor Process has failed" [email protected]
diff --git a/modules/png-extraction/anon_pydicom.py b/modules/png-extraction/anon_pydicom.py
@@ -0,0 +1,160 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+"""To do list:
+- some history of anonymization (to continue at later time)
+- maybe some GUI (tkinter) to make it easier to use
+"""
+
+import os
+import sys
+import pydicom
+import random
+import shutil
+import pickle
+
+
+def get_dcm_folders(dcm_root_dir):
+    """Collect the folders under ``dcm_root_dir`` that appear to hold DICOM files.
+
+    The tree is walked with ``os.walk``. A folder is kept when the name of the
+    first entry returned by ``os.listdir`` contains ``'dcm'`` (i.e. it is
+    assumed that all files in a study folder are dcm files, so only one entry
+    is checked to keep the scan fast).
+
+    Folders whose path contains ``'bk'`` or ``'March5'`` are rejected because
+    they are not in the accession list (HCC specific).
+
+    Returns:
+        list of folder paths, in the order ``os.walk`` visits them.
+    """
+    print('getting all dcm folders')
+    dcm_flds = []
+    for folder, _subdirs, _files in os.walk(dcm_root_dir):
+        # rejecting folders that are not in the accession list (HCC specific)
+        if 'bk' in folder or 'March5' in folder:
+            continue
+        try:
+            first_entry = os.listdir(folder)[0]
+        except (IndexError, OSError):
+            # IndexError: the folder is empty.
+            # OSError: the folder could not be listed (unreadable or removed).
+            print('no dcm files in folder, skipping')
+            continue
+        if 'dcm' in first_entry:
+            dcm_flds.append(folder)
+    return dcm_flds
+
+
+# randomly anonymizes the input id
+def randomizeID(id):
+    """Return a randomized copy of a dot-separated numeric identifier.
+
+    The first component is always kept as is. Every later component that is
+    exactly one character long is kept as well; every other component is read
+    as an integer and increased by a random offset between 0 and 9 (so a
+    component may end up unchanged when the offset is 0).
+    """
+    head, *rest = str(id).split('.')
+    parts = [head]
+    for part in rest:
+        if len(part) == 1:
+            # single-character components are carried over untouched
+            parts.append(part)
+        else:
+            parts.append(str(int(part) + random.randint(0, 9)))
+
+    return '.'.join(parts)
+
+
+# uniquely anonymizes the ID and keeps in dictionary for lookup
+def anonSample(file, idtype, dict):
+    """Return the anonymized value for ``file[idtype]``, creating it if needed.
+
+    ``dict`` maps original IDs to anonymized IDs. An ID that is already a key
+    gets its stored value back. Otherwise a new value is drawn with
+    ``randomizeID`` until it differs from every value already stored, and the
+    mapping is recorded in ``dict`` before it is returned.
+    """
+    original = file[idtype].value
+    if original not in dict:
+        candidate = randomizeID(original)
+        # make sure that the new ID isn't the same as another
+        while candidate in dict.values():
+            candidate = randomizeID(original)
+        dict[original] = candidate
+
+    return dict[original]
+
+
+def dcm_anonymize(dcm_folders, output_path, stop=None):
+    """Anonymize the DICOM files of each folder into ``output_path``.
+
+    For every folder, one randomly chosen file is read first (without forced
+    reading) to check that the folder can be processed. Its anonymized
+    StudyInstanceUID names a new sub-folder of ``output_path``. Each file of
+    the folder is then read, stripped of private tags, given anonymized
+    Study/Series/SOP instance UIDs, has the tags in ``anon_tags`` blanked, and
+    is saved in the new sub-folder under its anonymized SOPInstanceUID.
+
+    Folders that fail are recorded as (AccessionNumber, StudyInstanceUID) in
+    the skipped list. The UID lookup tables and the skipped list are pickled
+    to ``UIDs.pkl`` and ``skipped.pkl`` in ``output_path``.
+
+    Args:
+        dcm_folders: folders to anonymize, one study per folder.
+        output_path: existing folder that receives the anonymized studies.
+        stop: optional number of successfully anonymized folders after which
+            processing ends.
+
+    Returns:
+        None.
+    """
+    # dictionaries of original -> anonymized IDs, kept for look up later
+    UIDs = {'StudyInstanceUID': {},
+            'SeriesInstanceUID': {},
+            'SOPInstanceUID': {}}
+
+    # To continue an earlier run, the tables could be reloaded instead:
+    # UIDs = pickle.load(open(os.path.join(output_path, "UIDs.pkl"), "rb"))
+
+    skipped = []
+
+    # tags to anonymize
+    anon_tags = ['InstanceCreationDate', 'InstanceCreationTime', 'AccessionNumber', 'StudyDate',
+                 'SeriesDate', 'AcquisitionDate', 'ContentDate', 'StudyTime', 'SeriesTime', 'AcquisitionTime',
+                 'ContentTime', 'InstitutionName', 'InstitutionAddress', 'ReferringPhysicianName',
+                 'PhysiciansOfRecord', 'PerformingPhysicianName', 'OperatorsName', 'PatientName', 'PatientID',
+                 'IssuerOfPatientID', 'PatientBirthDate', 'PatientSex', 'OtherPatientIDs', 'PatientAge', 'PatientSize',
+                 'PatientWeight', 'PatientAddress', 'EthnicGroup', 'PregnancyStatus', 'RequestingPhysician',
+                 'PerformedProcedureStepStartDate', 'PerformedProcedureStepStartTime', 'PerformedProcedureStepID']
+
+    def save_progress():
+        # 'with' closes the pickle files even if dumping fails
+        with open(os.path.join(output_path, "UIDs.pkl"), "wb") as uid_file:
+            pickle.dump(UIDs, uid_file)
+        with open(os.path.join(output_path, "skipped.pkl"), "wb") as skipped_file:
+            pickle.dump(skipped, skipped_file)
+
+    n = 0
+    for dcm_folder in dcm_folders:
+        files = os.listdir(dcm_folder)
+        if not files:
+            # nothing to sample from; picking a random file would fail
+            print('no files in folder, skipping')
+            continue
+        test_file_path = os.path.join(dcm_folder, random.choice(files))
+        print('testing folder: {}'.format(dcm_folder))
+        # check if dcm folder has the invalid dicom error or not (no forced dicom file reading)
+        try:
+            test_file = pydicom.dcmread(test_file_path)
+            anon_id = anonSample(test_file, 'StudyInstanceUID', UIDs['StudyInstanceUID'])
+            # make folder with the anonymized studyUID name
+            print(anon_id)
+            study_folder = os.path.join(output_path, anon_id)
+            os.mkdir(study_folder)
+            for file in files:
+                # Read straight from the source folder: copying the original
+                # into the output folder first would leave an un-anonymized
+                # copy next to the anonymized one.
+                dcm_file = pydicom.dcmread(os.path.join(dcm_folder, file))
+                dcm_file.remove_private_tags()
+                for UID in UIDs:
+                    # get the UID and get the anonymized UID
+                    anon_id = anonSample(dcm_file, UID, UIDs[UID])
+                    # save instance UID to rename the filename (so that filename and SOPinstance matches)
+                    if UID == 'SOPInstanceUID':
+                        # anon_id is a str (immutable), so no copy is needed
+                        new_filename = anon_id
+                    dcm_file[UID].value = anon_id
+                # for the other tags, make them anonymous
+                for tag in anon_tags:
+                    if tag in dcm_file:
+                        element = dcm_file.data_element(tag)
+                        if isinstance(element.value, str):
+                            element.value = 'N/A'
+                        elif isinstance(element.value, int):
+                            element.value = 0
+                        else:
+                            # NOTE(review): every other value type (e.g. person
+                            # names, multi-valued elements) is overwritten with
+                            # 0.0 -- confirm pydicom can write that for each tag.
+                            element.value = 0.0
+                dcm_file.save_as(os.path.join(study_folder, new_filename))
+            n += 1
+            print('total files anonymized: {}/{}. Study: {}'.format(n, len(dcm_folders), study_folder), flush=True)
+        except Exception as e:
+            print('Invalid Dicom Error, skipping')
+            # show the real cause; not every failure is an invalid DICOM file
+            print(e)
+            skip_file = pydicom.dcmread(test_file_path, force=True)
+            skipped.append((getattr(skip_file, 'AccessionNumber', None),
+                            getattr(skip_file, 'StudyInstanceUID', None)))
+            continue
+        if n == stop or n == len(dcm_folders):
+            save_progress()
+            print('anonymized {} samples, exiting.'.format(n), flush=True)
+            return
+
+        save_progress()
+
+    # also persist skips recorded after the last successful folder
+    save_progress()
+
+
+if __name__ == "__main__":
+    # ex: 'python anon_pydicom.py /labs/banerjeelab/researchpacs_data/ /labs/banerjeelab/HCC_anon_dcm/200_noForce/'
+    # An optional third argument gives the stopping number.
+    if len(sys.argv) < 3:
+        sys.exit('usage: python anon_pydicom.py DATA_DIR OUTPUT_DIR [STOP]')
+    data_dir = sys.argv[1]
+    output_dir = sys.argv[2]
+    # sys.argv[3] only exists when more than three entries are present
+    # (script name, data dir, output dir, stop)
+    if len(sys.argv) > 3:
+        # stopping number
+        stop = int(sys.argv[3])
+    else:
+        stop = None
+    print('Extracting DICOM folders', flush=True)
+    dcm_folders = get_dcm_folders(data_dir)
+    print('Starting DICOM Study Anonymization', flush=True)
+    dcm_anonymize(dcm_folders, output_dir, stop=stop)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		The folder with the pickle files. These files trace the progress.