Merge pull request #248 from Emory-HITI/dev
Pandas edits to the suvpar module
pradeeban authored Dec 10, 2021
2 parents 81bf6c0 + 1e1e409 commit fabf7b9
Showing 9 changed files with 190 additions and 30 deletions.
8 changes: 7 additions & 1 deletion README.md
@@ -1,6 +1,12 @@
# Niffler: A DICOM Framework for Machine Learning and Processing Pipelines.

Niffler is an efficient DICOM Framework for machine learning pipelines and processing workflows on metadata. It facilitates efficient transfer of DICOM images on-demand and real-time from PACS to the research environments, to run processing workflows and machine learning pipelines.
Niffler is a lightweight framework to facilitate executing machine learning pipelines and processing workflows on DICOM images and metadata. Niffler facilitates efficient on-demand and real-time transfer of DICOM images from the PACS to the research environments. Niffler is also integrated with the radiology information system (RIS) to retrieve clinical data in real-time. The DICOM images from the PACS and the clinical data retrieved from the RIS can be used in conjunction, both in real-time and retrospectively on-demand.

The Niffler framework consists of:
- On-demand and real-time retrieval and processing of DICOM images from the PACS environment configured to accept requests from a deployment of Niffler.
- Acquisition and processing of clinical data from a RIS, to enable real-time analytics (RTA).
- Supportive utility functions such as DICOM → PNG conversion, DICOM → NIfTI conversion, DICOM anonymization, and a workflow module.
- Sample applications of the Niffler modules.

Niffler enables receiving DICOM images in real-time as a data stream from the PACS, as well as receiving specific DICOM data based on a series of DICOM C-MOVE queries. The Niffler real-time DICOM receiver extracts the metadata free of PHI as the images arrive, stores the metadata in a MongoDB database, and deletes the images nightly. The on-demand extractor reads a CSV file provided by the user (consisting of a list of values for PatientID, AccessionNumber, or other DICOM keywords) and performs a series of DICOM C-MOVE requests to receive them from the PACS, without manually querying them. Niffler also provides additional features such as converting DICOM images into PNG images, and performing additional computations such as computing scanner utilization and finding scanners with misconfigured clocks.
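As a purely illustrative sketch, an on-demand extraction CSV may be as simple as the following (hypothetical values; the header row must use the DICOM keyword, such as PatientID or AccessionNumber, that your query is based on):

```
AccessionNumber
1234567890
2345678901
```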

2 changes: 1 addition & 1 deletion modules/app-layer/pom.xml
@@ -47,7 +47,7 @@
      <dependency>
        <groupId>org.apache.logging.log4j</groupId>
        <artifactId>log4j-core</artifactId>
        <version>2.13.2</version>
        <version>2.15.0</version>
      </dependency>
    </dependencies>

3 changes: 3 additions & 0 deletions modules/dicom-anonymization/DicomAnonymizer.py
@@ -10,6 +10,8 @@
import sys
import pydicom
import random
import glob
import pathlib
import pickle


@@ -33,6 +35,7 @@ def get_dcm_folders(dcm_root_dir):
    return dcm_flds



# randomly anonymizes the input id
def randomizeID(id):
    string = str(id)
128 changes: 128 additions & 0 deletions modules/dicom-anonymization/DicomAnonymizer2.py
@@ -0,0 +1,128 @@
import os
import sys
import pydicom
import random
import glob
import pathlib
import pickle
import string

def get_dcm_paths(dcm_root_dir):
    paths = glob.glob(os.path.join(dcm_root_dir, "**/*.dcm"), recursive=True)
    return paths

def randomizeID(id):
    id_str = str(id)  # local name chosen so the string module above is not shadowed
    splits = id_str.split('.')
    newID = splits[0]
    i = 0
    for split in splits:
        if i == 0:
            i += 1
            continue
        elif len(split) == 1:
            newID = '.'.join((newID, split))
            continue
        num = int(split) + random.randint(0, 9)
        newID = '.'.join((newID, str(num)))

    return newID

def anonSample(file, idtype, dict):
    id = file[idtype].value
    if id in dict.keys():
        anon_id = dict[id]
    else:
        if idtype == 'PatientID':
            anon_id = ''.join(random.SystemRandom().choice(string.ascii_uppercase + string.digits) for _ in range(25))
        else:
            anon_id = randomizeID(id)
        # make sure that the new ID isn't the same as another
        while anon_id in dict.values():
            anon_id = randomizeID(id)
        dict[id] = anon_id

    return anon_id


def dcm_anonymize(dcm_files, output_path, stop=None):
    # creates dictionaries for the IDs for look up later
    samplePatientIDs = {}
    sampleStudyInstanceUIDs = {}
    sampleSeriesInstanceUID = {}
    sampleSOPInstanceUID = {}

    UIDs = {'PatientID': samplePatientIDs,
            'StudyInstanceUID': sampleStudyInstanceUIDs,
            'SeriesInstanceUID': sampleSeriesInstanceUID,
            'SOPInstanceUID': sampleSOPInstanceUID}

    # UIDs = pickle.load(open(os.path.join(output_path, "UIDs.pkl"), "rb"))

    skipped = []

    # tags to anonymize
    anon_tags = ['InstanceCreationDate', 'InstanceCreationTime', 'AccessionNumber', 'StudyDate',
                 'SeriesDate', 'AcquisitionDate', 'ContentDate', 'StudyTime', 'SeriesTime', 'AcquisitionTime',
                 'ContentTime', 'InstitutionName', 'InstitutionAddress', 'ReferringPhysicianName',
                 'PhysiciansOfRecord', 'PerformingPhysicianName', 'OperatorsName', 'PatientName',
                 'IssuerOfPatientID', 'PatientBirthDate', 'PatientSex', 'OtherPatientIDs', 'PatientAge', 'PatientSize',
                 'PatientWeight', 'PatientAddress', 'EthnicGroup', 'PregnancyStatus', 'RequestingPhysician',
                 'PerformedProcedureStepStartDate', 'PerformedProcedureStepStartTime', 'PerformedProcedureStepID']

    # anonymize the files, stopping early once `stop` files have been processed
    n = 0
    for file in dcm_files:
        try:  # skip any file that cannot be read as a valid DICOM file
            dcm_file = pydicom.dcmread(file)
            dcm_file.remove_private_tags()
            out_path = output_path
            for UID in UIDs.keys():
                # get the UID and get the anonymized UID
                anon_id = anonSample(dcm_file, UID, UIDs[UID])
                dcm_file[UID].value = anon_id
                out_path = os.path.join(out_path, anon_id)

            out_path += ".dcm"
            # save instance UID to rename the filename (so that filename and SOPInstanceUID match)
            # for the other tags, make them anonymous
            for tag in anon_tags:
                if tag in dcm_file:
                    if type(dcm_file.data_element(tag).value) == str:
                        dcm_file.data_element(tag).value = 'N/A'
                    elif type(dcm_file.data_element(tag).value) == pydicom.uid.UID:
                        dcm_file.data_element(tag).value = 'N/A'
                    elif type(dcm_file.data_element(tag).value) == int:
                        dcm_file.data_element(tag).value = 0
                    else:
                        dcm_file.data_element(tag).value = 0.0

            pathlib.Path("/".join(out_path.split("/")[:-1])).mkdir(parents=True, exist_ok=True)
            dcm_file.save_as(out_path)
            n += 1
        except Exception:
            print('Invalid Dicom Error, skipping')
            skip_file = pydicom.dcmread(file, force=True)
            skipped.append((skip_file.AccessionNumber, skip_file.StudyInstanceUID))
            continue
        if n == stop or n == len(dcm_files):
            pickle.dump(UIDs, open(os.path.join(output_path, "UIDs.pkl"), "wb"))
            sys.exit()

    pickle.dump(UIDs, open(os.path.join(output_path, "UIDs.pkl"), "wb"))
    pickle.dump(skipped, open(os.path.join(output_path, "skipped.pkl"), "wb"))


if __name__ == "__main__":
    data_dir = sys.argv[1]
    output_dir = sys.argv[2]
    if len(sys.argv) > 3:
        # stopping number
        stop = int(sys.argv[3])
    else:
        stop = None
    print('Extracting DICOM folders', flush=True)
    dcm_folders = get_dcm_paths(data_dir)
    print('Starting DICOM Study Anonymization', flush=True)
    dcm_anonymize(dcm_folders, output_dir, stop=stop)
4 changes: 4 additions & 0 deletions modules/dicom-anonymization/README.md
@@ -4,3 +4,7 @@ You may convert a DICOM file into an anonymized DICOM file by running
```
python DicomAnonymizer.py <INPUT-DICOMS-FOLDER> <OUTPUT-DICOMS-FOLDER>
```
To maintain the source directory hierarchy, and to anonymize and map the PatientID field as well, you can run
```
python DicomAnonymizer2.py <INPUT-DICOMS-FOLDER> <OUTPUT-DICOMS-FOLDER>
```
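`DicomAnonymizer2.py` additionally accepts an optional third argument, a stopping number of files after which the anonymization halts. The original-to-anonymized ID mappings are pickled to `UIDs.pkl` in the output folder. As a minimal sketch of inspecting those mappings afterwards (the folder name `anonymized-dicoms` below is hypothetical):
```
import os
import pickle

output_dir = "anonymized-dicoms"  # hypothetical: the <OUTPUT-DICOMS-FOLDER> passed above

# DicomAnonymizer2.py pickles a dict that maps each tag name
# ('PatientID', 'StudyInstanceUID', ...) to {original ID: anonymized ID}.
with open(os.path.join(output_dir, "UIDs.pkl"), "rb") as f:
    uid_maps = pickle.load(f)

print(uid_maps["PatientID"])  # original-to-anonymized PatientID mapping
```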
3 changes: 2 additions & 1 deletion modules/nifti-extraction/ImageExtractorNifti.py
@@ -202,7 +202,8 @@ def extract_images(filedata, i, nifti_destination, flattened_to_level, failed, i
        # check for existence of the folder tree patient/study/series. Create if it does not exist.
        os.makedirs(nifti_destination + folderName, exist_ok=True)

        niftifile = nifti_destination + folderName + '/' + imName + '.nii.gz'

        niftifile = nifti_destination + folderName + '/' + ID1 + '_' + ID2 + '_' + ID3 + '.nii.gz'
        dicom2nifti.dicom_series_to_nifti(str(filedata.iloc[i].loc['file']), niftifile)
        filemapping = filedata.iloc[i].loc['file'] + ',' + niftifile + '\n'
    except AttributeError as error:
17 changes: 5 additions & 12 deletions modules/nifti-extraction/README.md
@@ -1,6 +1,6 @@
# The Niffler PNG Extractor
# The Niffler NIfTI Extractor

The PNG Extractor converts a set of DICOM images into png images, extract metadata in a privacy-preserving manner.
The NIfTI Extractor converts a set of DICOM images into NIfTI images and extracts metadata in a privacy-preserving manner.


## Configuring Niffler PNG Extractor
@@ -20,8 +20,6 @@ Find the config.json file in the folder and modify accordingly *for each* Niffler
* *FlattenedToLevel*: Specify how you want your folder tree to be. Default is, "patient" (produces patient/*.png).
You may change this value to "study" (patient/study/*.png) or "series" (patient/study/series/*.png). All IDs are de-identified.

* *is16Bit*: Specifies whether to save extracted image as 16-bit image. By default, this is set to true. Please set it to false to run 8-bit extraction.

* *SendEmail*: Do you want to send an email notification when the extraction completes? The default is true. You may disable this if you do not want to receive an email upon the completion.

* *YourEmail*: Replace "[email protected]" with a valid email if you would like to receive an email notification. If the SendEmail property is disabled, you can leave this as is.
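For illustration, a minimal config.json sketch covering the fields described above could look as follows (hypothetical values only; the actual file shipped with the module may define additional fields, such as the DICOMHome and Depth settings that also appear as command-line arguments below):

```
{
  "FlattenedToLevel": "study",
  "SendEmail": true,
  "YourEmail": "[email protected]"
}
```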
@@ -42,10 +40,10 @@ The below two fields can be left unmodified for most executions. The default val
$ python3 ImageExtractor.py

# With Nohup
$ nohup python3 ImageExtractor.py > UNIQUE-OUTPUT-FILE-FOR-YOUR-EXTRACTION.out &
$ nohup python3 ImageExtractorNifti.py > UNIQUE-OUTPUT-FILE-FOR-YOUR-EXTRACTION.out &

# With Command Line Arguments
$ nohup python3 ImageExtractor.py --DICOMHome "/opt/data/new-study" --Depth 0 --PrintImages true --SendEmail true > UNIQUE-OUTPUT-FILE-FOR-YOUR-EXTRACTION.out &
$ nohup python3 ImageExtractorNifti.py --DICOMHome "/opt/data/new-study" --Depth 0 --PrintImages true --SendEmail true > UNIQUE-OUTPUT-FILE-FOR-YOUR-EXTRACTION.out &
```
Check that the extraction is running smoothly with no errors, by,

@@ -65,12 +63,7 @@ In the OutputDirectory, there will be several sub folders and directories.

* *extracted-images*: The folder that consists of extracted PNG images

* *failed-dicom*: The folder that consists of the DICOM images that failed to produce the PNG images upon the execution of the Niffler PNG Extractor. Failed DICOM images are stored in 4 sub-folders named 1, 2, 3, and 4, categorizing according to their failure reason.


## Running the Niffler PNG Extractor with Slurm

There is also an experimental PNG extractor implementation (ImageExtractorSlurm.py) that provides a distributed execution based on Slurm on a cluster.
* *failed-dicom*: The folder that consists of the DICOM images that failed to produce the NIfTI images upon the execution of the Niffler NIfTI Extractor. Failed DICOM images are stored in 4 sub-folders named 1, 2, 3, and 4, categorized according to their failure reason.


## Troubleshooting
32 changes: 20 additions & 12 deletions modules/png-extraction/ImageExtractor.py
@@ -145,7 +145,7 @@ def extract_headers(f_list_elem):
    c = False
    kv = get_tuples(plan)  # gets tuple for field,val pairs for this file. function defined above
    # dicom images should not have more than 300 dicom tags
    if len(kv)>500:
    if len(kv)>300:
        logging.debug(str(len(kv)) + " dicom tags produced by " + ff)
    kv.append(('file', f_list_elem[1]))  # adds my custom field with the original filepath
    kv.append(('has_pix_array',c))  # adds my custom field with if file has image
@@ -205,6 +205,7 @@ def extract_images(filedata, i, png_destination, flattened_to_level, failed, is16Bit):
        pngfile = png_destination+folderName + '/' + hashlib.sha224(imName.encode('utf-8')).hexdigest() + '.png'
        dicom_path = filedata.iloc[i].loc['file']
        image_path = png_destination+folderName+'/' + hashlib.sha224(imName.encode('utf-8')).hexdigest() + '.png'
        isRGB = filedata.iloc[i].loc['PhotometricInterpretation'] == 'RGB'
        if is16Bit:
            # write the PNG file as a 16-bit greyscale
            image_2d = ds.pixel_array.astype(np.double)
@@ -214,7 +215,10 @@
            shape = ds.pixel_array.shape
            image_2d_scaled = np.uint16(image_2d_scaled)
            with open(pngfile , 'wb') as png_file:
                w = png.Writer(shape[1], shape[0], greyscale=True,bitdepth=16)
                if isRGB:
                    w = png.Writer(shape[1], shape[0], greyscale=False,bitdepth=16)
                else:
                    w = png.Writer(shape[1], shape[0], greyscale=True,bitdepth=16)
                w.write(png_file, image_2d_scaled)
        else:
            shape = ds.pixel_array.shape
@@ -226,7 +230,10 @@
            image_2d_scaled = np.uint8(image_2d_scaled)
            # Write the PNG file
            with open(pngfile , 'wb') as png_file:
                w = png.Writer(shape[1], shape[0], greyscale=True)
                if isRGB:
                    w = png.Writer(shape[1], shape[0], greyscale=False)
                else:
                    w = png.Writer(shape[1], shape[0], greyscale=True)
                w.write(png_file, image_2d_scaled)
        filemapping = filedata.iloc[i].loc['file'] + ', ' + pngfile + '\n'
    except AttributeError as error:
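For context, a minimal standalone sketch of the pypng pattern used in the hunks above (made-up 2×2 pixel data; an RGB row interleaves the R, G, B values of each pixel, so only the greyscale flag changes between the two cases):

```
import png

# 2x2 greyscale image: one value per pixel in each row
grey_rows = [[0, 255], [255, 0]]
with open('grey.png', 'wb') as f:
    png.Writer(2, 2, greyscale=True).write(f, grey_rows)

# 2x2 RGB image: three interleaved values per pixel in each row
rgb_rows = [[255, 0, 0, 0, 255, 0], [0, 0, 255, 255, 255, 255]]
with open('rgb.png', 'wb') as f:
    png.Writer(2, 2, greyscale=False).write(f, rgb_rows)
```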
@@ -379,14 +386,15 @@ def execute(pickle_file, dicom_home, output_directory, print_images, print_only_
    total = len(chunk)
    stamp = time.time()
    for i in range(len(filedata)):
        (fmap,fail_path,err) = extract_images(filedata, i, png_destination, flattened_to_level, failed, is16Bit)
        if err:
            count +=1
            copyfile(fail_path[0],fail_path[1])
            err_msg = str(count) + ' out of ' + str(len(chunk)) + ' dicom images have failed extraction'
            logging.error(err_msg)
        else:
            fm.write(fmap)
        if (filedata.iloc[i].loc['file'] is not np.nan):
            (fmap,fail_path,err) = extract_images(filedata, i, png_destination, flattened_to_level, failed, is16Bit)
            if err:
                count +=1
                copyfile(fail_path[0],fail_path[1])
                err_msg = str(count) + ' out of ' + str(len(chunk)) + ' dicom images have failed extraction'
                logging.error(err_msg)
            else:
                fm.write(fmap)
    fm.close()
    logging.info('Chunk run time: %s %s', time.time() - t_start, ' seconds!')

@@ -432,7 +440,7 @@ def execute(pickle_file, dicom_home, output_directory, print_images, print_only_
    merged_meta = pd.concat(meta_list,ignore_index=True)
    merged_meta.to_csv('{}/metadata.csv'.format(output_directory),index=False)
    # getting a single mapping file
    logging.info('Generatign final mapping file')
    logging.info('Generating final mapping file')
    mappings = glob.glob("{}/maps/*.csv".format(output_directory))
    map_list = list()
    for mapping in mappings:
23 changes: 20 additions & 3 deletions modules/suvpar/Strip.py
@@ -20,15 +20,32 @@ def initialize():
    feature_list = text_file.read().split('\n')

    df = pandas.read_csv(filename, usecols=lambda x: x in feature_list, sep=',')
    logging.info(df['ImageType'])


def strip():
    global df
    # Drop entries without an ImageType, AcquisitionTime, AcquisitionDate, AccessionNumber, or DeviceSerialNumber entry.
    df.dropna(subset=["ImageType"], inplace=True)
    df.dropna(subset=["AccessionNumber"], inplace=True)
    df.dropna(subset=["AcquisitionTime"], inplace=True)
    df.dropna(subset=["AcquisitionDate"], inplace=True)
    df.dropna(subset=["DeviceSerialNumber"], inplace=True)
    # Consider only the ImageType that are ORIGINAL.
    df = df[df['ImageType'].str.contains("ORIGINAL")]
    # Consider only MR. Remove modalities such as PR and SR that are present in the original data.
    df = df[df.Modality == "MR"]
    # Consider only the ImageType that are true.
    df = df[df['ImageType'].str.contains("ORIGINAL")]
    # Ignore milliseconds
    df['AcquisitionTime'] = df['AcquisitionDate'].astype(int).astype(str) + \
                            df['AcquisitionTime'].astype(int).astype(str)
    df['AcquisitionTime'] = pandas.to_datetime(df['AcquisitionTime'], format='%Y%m%d%H%M%S')
    df = df.join(
        df.groupby('AccessionNumber')['AcquisitionTime'].aggregate(['min', 'max']),
        on='AccessionNumber')
    df.rename(columns={'AcquisitionTime': 'AcquisitionDateTime'}, inplace=True)
    df.rename(columns={'min': 'MinAcquisitionDateTime'}, inplace=True)
    df.rename(columns={'max': 'MaxAcquisitionDateTime'}, inplace=True)
    df = df.drop_duplicates('AccessionNumber')
    df = df.drop(columns=['AcquisitionDate'])


def write():
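For clarity, a minimal, self-contained illustration of the pandas pattern that strip() uses above to attach per-accession minimum and maximum acquisition times (made-up data; only the groupby/aggregate/join step is shown):

```
import pandas

# Hypothetical input: two rows for accession A1, one row for A2.
df = pandas.DataFrame({
    'AccessionNumber': ['A1', 'A1', 'A2'],
    'AcquisitionTime': pandas.to_datetime(
        ['20211210090000', '20211210091500', '20211210100000'],
        format='%Y%m%d%H%M%S'),
})

# Join each row with the earliest and latest acquisition time of its accession.
df = df.join(
    df.groupby('AccessionNumber')['AcquisitionTime'].aggregate(['min', 'max']),
    on='AccessionNumber')
df.rename(columns={'min': 'MinAcquisitionDateTime',
                   'max': 'MaxAcquisitionDateTime'}, inplace=True)

# As in strip(), keep one row per accession.
print(df.drop_duplicates('AccessionNumber'))
```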
