Skip to content

Commit

Permalink
Merge pull request #122 from Emory-HITI/dev
Browse files Browse the repository at this point in the history
Merge dev to master
  • Loading branch information
pradeeban authored Apr 9, 2021
2 parents 8e1146b + 74707f1 commit 0c32a7d
Show file tree
Hide file tree
Showing 6 changed files with 51 additions and 46 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ Niffler enables receiving DICOM images real-time as a data stream from PACS as w

# Configure Niffler

Niffler consists of 4 modules, inside the modules folder. Here we will look into the common configuration and installation steps of Niffler. An introduction to Niffler can be found [here](https://emory-hiti.github.io/Niffler/).
Niffler consists of 5 modules, inside the modules folder. Here we will look into the common configuration and installation steps of Niffler. An introduction to Niffler can be found [here](https://emory-hiti.github.io/Niffler/).

## Configure PACS

Expand Down
4 changes: 4 additions & 0 deletions docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ Receives DICOM images as a stream from a PACS and extracts and stores the metada

Converts a set of DICOM images into png images, extract metadata in a privacy-preserving manner. The extracted metadata is stored in a CSV file, along with the de-identified PNG images. The mapping of PNG files and their respective metadata is stored in a separate CSV file.

## dicom-anonymization

Converts a set of DICOM images into anonymized DICOM images, stripping off the PHI.

## app-layer

The app-layer (application layer) consists of specific algorithms. The app-layer/src/main/scripts consists of Javascript scripts such as scanner clock calibration. The app-layer/src/main/java consists of the scanner utilization computation algorithms developed in Java.
Expand Down
33 changes: 19 additions & 14 deletions modules/cold-extraction/ColdDataRetriever.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,23 +131,28 @@ def initialize():
reader = csv.reader(f)
next(f)
for row in reader:
row = [x.strip() for x in row]
if (extraction_type == 'empi_date'):
patients.append(row[patient_index])
temp_date = row[date_index]
dt_stamp = datetime.datetime.strptime(temp_date, date_format)
date_str = dt_stamp.strftime('%Y%m%d')
dates.append(date_str)
length = len(patients)
if not ((row[patient_index] == "") or (row[date_index] == "")):
patients.append(row[patient_index])
temp_date = row[date_index]
dt_stamp = datetime.datetime.strptime(temp_date, date_format)
date_str = dt_stamp.strftime('%Y%m%d')
dates.append(date_str)
length = len(patients)
elif (extraction_type == 'empi'):
patients.append(row[patient_index])
length = len(patients)
if not ((row[patient_index] == "")):
patients.append(row[patient_index])
length = len(patients)
elif (extraction_type == 'accession'):
accessions.append(row[accession_index])
length = len(accessions)
if not ((row[accession_index] == "")):
accessions.append(row[accession_index])
length = len(accessions)
elif (extraction_type == 'empi_accession'):
patients.append(row[patient_index])
accessions.append(row[accession_index])
length = len(accessions)
if not ((row[patient_index] == "") or (row[accession_index] == "")):
patients.append(row[patient_index])
accessions.append(row[accession_index])
length = len(accessions)


# Run the retrieval only once, when the extraction script starts, and keep it running in a separate thread.
Expand Down Expand Up @@ -277,4 +282,4 @@ def run_threaded(job_func):
time.sleep(1)
except KeyboardInterrupt:
check_kill_process()
sys.exit(0)
sys.exit(0)
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
import sys
import pydicom
import random
import shutil
import pickle


Expand Down Expand Up @@ -102,20 +101,17 @@ def dcm_anonymize(dcm_folders, output_path, stop=None):
test_file = pydicom.dcmread(test_file_path)
anon_id = anonSample(test_file, 'StudyInstanceUID', UIDs['StudyInstanceUID'])
# make folder with the anonymized studyUID name
print(anon_id)
study_folder = os.path.join(output_path, anon_id)
os.mkdir(study_folder)
for file in files:
# copy the file to the new anon folder
shutil.copyfile(os.path.join(dcm_folder, file), os.path.join(study_folder, file))
dcm_file = pydicom.dcmread(os.path.join(study_folder, file))
dcm_file = pydicom.dcmread(os.path.join(dcm_folder, file))
dcm_file.remove_private_tags()
for UID in UIDs.keys():
# get the UID and get the anonymized UID
anon_id = anonSample(dcm_file, UID, UIDs[UID])
# save instance UID to rename the filename (so that filename and SOPinstance matches)
if UID == 'SOPInstanceUID':
new_filename = anon_id.copy()
new_filename = anon_id
dcm_file[UID].value = anon_id
# for the other tags, make them anonymous
for tag in anon_tags:
Expand All @@ -128,9 +124,9 @@ def dcm_anonymize(dcm_folders, output_path, stop=None):
dcm_file.data_element(tag).value = 0
else:
dcm_file.data_element(tag).value = 0.0
dcm_file.save_as(os.path.join(study_folder, new_filename))
dcm_file.save_as(os.path.join(study_folder, new_filename + '.dcm'))
n += 1
print('total files anonymized: {}/{}. Study: {}'.format(n, len(dcm_folders), study_folder), flush=True)
print('total folders anonymized: {}/{}. Study: {}'.format(n, len(dcm_folders), study_folder), flush=True)
except:
print('Invalid Dicom Error, skipping')
skip_file = pydicom.dcmread(test_file_path, force=True)
Expand All @@ -146,15 +142,14 @@ def dcm_anonymize(dcm_folders, output_path, stop=None):


if __name__ == "__main__":
# ex: 'python anon_pydicom.py /labs/banerjeelab/researchpacs_data/ /labs/banerjeelab/HCC_anon_dcm/200_noForce/'
data_dir = sys.argv[1]
output_dir = sys.argv[2]
if len(sys.argv) > 2:
if len(sys.argv) > 3:
# stopping number
stop = int(sys.argv[3])
else:
stop = None
print('Extracting DICOM folders', flush=True)
dcm_folders = get_dcm_folders(data_dir)
print('Starting DICOM Study Anonymization', flush=True)
dcm_anonymize(dcm_folders, output_dir, stop=stop)
dcm_anonymize(dcm_folders, output_dir, stop=None)
6 changes: 6 additions & 0 deletions modules/dicom-anonymization/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# The Niffler DICOM Anonymizer

You may convert a DICOM file into an anonymized DICOM file by running
```
python DicomAnonymizer.py <INPUT-DICOMS-FOLDER> <OUTPUT-DICOMS-FOLDER>
```
35 changes: 15 additions & 20 deletions modules/png-extraction/ImageExtractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import numpy as np
import pandas as pd
import pydicom as dicom
import png
#pydicom imports needed to handle data errors
from pydicom import config
from pydicom import datadict
Expand Down Expand Up @@ -152,8 +153,7 @@ def extract_images(i):
ID=filedata.iloc[i].loc['PatientID'] # Unique identifier for the Patient.
folderName = hashlib.sha224(ID.encode('utf-8')).hexdigest()
#check for existence of patient folder. Create if it does not exist.
if not (os.path.exists(png_destination + folderName)): # it is completely possible for multiple proceses to run this check at same time.
os.mkdir(png_destination + folderName)
os.makedirs(png_destination + folderName,exist_ok=True)
elif flattened_to_level == 'study':
ID1=filedata.iloc[i].loc['PatientID'] # Unique identifier for the Patient.
try:
Expand All @@ -163,8 +163,7 @@ def extract_images(i):
folderName = hashlib.sha224(ID1.encode('utf-8')).hexdigest() + "/" + \
hashlib.sha224(ID2.encode('utf-8')).hexdigest()
#check for existence of the folder tree patient/study/series. Create if it does not exist.
if not (os.path.exists(png_destination + folderName)): # it is completely possible for multiple proceses to run this check at same time.
os.makedirs(png_destination + folderName)
os.makedirs(png_destination + folderName,exist_ok=True)
else:
ID1=filedata.iloc[i].loc['PatientID'] # Unique identifier for the Patient.
try:
Expand All @@ -176,8 +175,7 @@ def extract_images(i):
folderName = hashlib.sha224(ID1.encode('utf-8')).hexdigest() + "/" + \
hashlib.sha224(ID2.encode('utf-8')).hexdigest() + "/" + hashlib.sha224(ID3.encode('utf-8')).hexdigest()
#check for existence of the folder tree patient/study/series. Create if it does not exist.
if not (os.path.exists(png_destination + folderName)): # it is completely possible for multiple proceses to run this check at same time.
os.makedirs(png_destination + folderName)
os.makedirs(png_destination + folderName,exist_ok=True)


pngfile = png_destination+folderName+'/' + hashlib.sha224(imName.encode('utf-8')).hexdigest() + '.png'
Expand Down Expand Up @@ -296,7 +294,6 @@ def fix_mismatch(with_VRs=['PN', 'DS', 'IS']):
pickle.dump(filelist,open(pickle_file,'wb'))
file_chunks = np.array_split(filelist,no_splits)
logging.info('Number of dicom files: ' + str(len(filelist)))
logging.info('Number of chunks is 100 with size ' + str(len(file_chunks[0])) )

try:
ff = filelist[0] #load first file as a template to look at all
Expand Down Expand Up @@ -345,19 +342,17 @@ def fix_mismatch(with_VRs=['PN', 'DS', 'IS']):
filedata=data
total = len(chunk)
stamp = time.time()
p = Pool(core_count)
res = p.imap_unordered(extract_images,range(len(filedata)))
for out in res:
(fmap,fail_path,err) = out
if err:
count +=1
copyfile(fail_path[0],fail_path[1])
err_msg = str(count) + ' out of ' + str(len(chunk)) + ' dicom images have failed extraction'
logging.error(err_msg)
else:
fm.write(fmap)
p.join()
p.close()
with Pool(core_count) as p:
res = p.imap_unordered(extract_images,range(len(filedata)))
for out in res:
(fmap,fail_path,err) = out
if err:
count +=1
copyfile(fail_path[0],fail_path[1])
err_msg = str(count) + ' out of ' + str(len(chunk)) + ' dicom images have failed extraction'
logging.error(err_msg)
else:
fm.write(fmap)
fm.close()
logging.info('Chunk run time: %s %s', time.time() - t_start, ' seconds!')

Expand Down

0 comments on commit 0c32a7d

Please sign in to comment.