From 87d894b681444d8120241a9a3e041040f1b6c476 Mon Sep 17 00:00:00 2001 From: Nishchal-007 Date: Tue, 30 Mar 2021 20:25:43 +0530 Subject: [PATCH 01/20] missing csv entries --- modules/cold-extraction/ColdDataRetriever.py | 60 ++++++++++---------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/modules/cold-extraction/ColdDataRetriever.py b/modules/cold-extraction/ColdDataRetriever.py index 24c863c..d358375 100644 --- a/modules/cold-extraction/ColdDataRetriever.py +++ b/modules/cold-extraction/ColdDataRetriever.py @@ -130,24 +130,40 @@ def initialize(): with open(csv_file, newline='') as f: reader = csv.reader(f) next(f) + + # Changed below part for finding missing csv entries and skipping them for row in reader: + row = [x.strip() for x in row] + #print(row) if (extraction_type == 'empi_date'): - patients.append(row[patient_index]) - temp_date = row[date_index] - dt_stamp = datetime.datetime.strptime(temp_date, date_format) - date_str = dt_stamp.strftime('%Y%m%d') - dates.append(date_str) - length = len(patients) + if set(row).pop()=='': + pass + else: + patients.append(row[patient_index]) + temp_date = row[date_index] + dt_stamp = datetime.datetime.strptime(temp_date, date_format) + date_str = dt_stamp.strftime('%Y%m%d') + dates.append(date_str) + length = len(patients) elif (extraction_type == 'empi'): - patients.append(row[patient_index]) - length = len(patients) + if set(row).pop()=='': + pass + else: + patients.append(row[patient_index]) + length = len(patients) elif (extraction_type == 'accession'): - accessions.append(row[accession_index]) - length = len(accessions) + if set(row).pop()=='': + pass + else: + accessions.append(row[accession_index]) + length = len(accessions) elif (extraction_type == 'empi_accession'): - patients.append(row[patient_index]) - accessions.append(row[accession_index]) - length = len(accessions) + if set(row).pop()=='': + pass + else: + patients.append(row[patient_index]) + accessions.append(row[accession_index]) + length = len(accessions) # Run the retrieval only once, when the extraction script starts, and keep it running in a separate thread. @@ -180,22 +196,6 @@ def retrieve(): subprocess.call("{0}/movescu -c {1} -b {2} -M PatientRoot -m PatientID={3} -m AccessionNumber={4} --dest {5}".format(DCM4CHE_BIN, SRC_AET, QUERY_AET, PatientID, Accession, DEST_AET), shell=True) extracted_ones.append(temp_id) - # For the cases that have the EMPI. - elif (extraction_type == 'empi'): - # Create our Identifier (query) dataset - for pid in range(0, len(patients)): - PatientID = patients[pid] - if NIGHTLY_ONLY: - if (datetime.datetime.now().hour >= END_HOUR and datetime.datetime.now().hour < START_HOUR): - # log once while sleeping - logging.info("Nightly mode. Niffler schedules the extraction to resume at start hour {0} and start within 30 minutes after that. It will then pause at the end hour {1}".format(START_HOUR, END_HOUR)) - while (datetime.datetime.now().hour >= END_HOUR and datetime.datetime.now().hour < START_HOUR): - # sleep for 5 minutes - time.sleep(300) - if ((not resume) or (resume and (PatientID not in extracted_ones))): - subprocess.call("{0}/movescu -c {1} -b {2} -M PatientRoot -m PatientID={3} --dest {4}".format(DCM4CHE_BIN, SRC_AET, QUERY_AET, PatientID, DEST_AET), shell=True) - extracted_ones.append(PatientID) - # For the cases that does not have the typical EMPI and Accession values together. 
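# Illustrative sketch (not part of this patch series): patch 01 above skips CSV
# rows whose required fields are blank. A self-contained version of that idea,
# assuming a header row and a "patient,date" column layout (the indices and the
# date format below are assumptions for the example, not the module's config):
import csv
import datetime

def load_empi_date_rows(csv_path, patient_index=0, date_index=1, date_format='%Y-%m-%d'):
    patients, dates = [], []
    with open(csv_path, newline='') as f:
        reader = csv.reader(f)
        next(reader)  # skip the header row
        for row in reader:
            row = [x.strip() for x in row]
            # skip the row when either required field is empty
            if row[patient_index] == '' or row[date_index] == '':
                continue
            patients.append(row[patient_index])
            dt = datetime.datetime.strptime(row[date_index], date_format)
            dates.append(dt.strftime('%Y%m%d'))
    return patients, dates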
elif (extraction_type == 'empi_date' or extraction_type == 'accession'): # Create our Identifier (query) dataset @@ -277,4 +277,4 @@ def run_threaded(job_func): time.sleep(1) except KeyboardInterrupt: check_kill_process() - sys.exit(0) \ No newline at end of file + sys.exit(0) From 45cf9afd004dbf9c08ff9589dc2f57aba3298c6f Mon Sep 17 00:00:00 2001 From: Nishchal Singi <71981858+Nishchal-007@users.noreply.github.com> Date: Tue, 30 Mar 2021 22:03:25 +0530 Subject: [PATCH 02/20] removed comment and added missing code --- modules/cold-extraction/ColdDataRetriever.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/modules/cold-extraction/ColdDataRetriever.py b/modules/cold-extraction/ColdDataRetriever.py index d358375..594a8eb 100644 --- a/modules/cold-extraction/ColdDataRetriever.py +++ b/modules/cold-extraction/ColdDataRetriever.py @@ -130,11 +130,10 @@ def initialize(): with open(csv_file, newline='') as f: reader = csv.reader(f) next(f) - + # Changed below part for finding missing csv entries and skipping them for row in reader: row = [x.strip() for x in row] - #print(row) if (extraction_type == 'empi_date'): if set(row).pop()=='': pass @@ -196,6 +195,22 @@ def retrieve(): subprocess.call("{0}/movescu -c {1} -b {2} -M PatientRoot -m PatientID={3} -m AccessionNumber={4} --dest {5}".format(DCM4CHE_BIN, SRC_AET, QUERY_AET, PatientID, Accession, DEST_AET), shell=True) extracted_ones.append(temp_id) + # For the cases that have the EMPI. + elif (extraction_type == 'empi'): + # Create our Identifier (query) dataset + for pid in range(0, len(patients)): + PatientID = patients[pid] + if NIGHTLY_ONLY: + if (datetime.datetime.now().hour >= END_HOUR and datetime.datetime.now().hour < START_HOUR): + # log once while sleeping + logging.info("Nightly mode. Niffler schedules the extraction to resume at start hour {0} and start within 30 minutes after that. It will then pause at the end hour {1}".format(START_HOUR, END_HOUR)) + while (datetime.datetime.now().hour >= END_HOUR and datetime.datetime.now().hour < START_HOUR): + # sleep for 5 minutes + time.sleep(300) + if ((not resume) or (resume and (PatientID not in extracted_ones))): + subprocess.call("{0}/movescu -c {1} -b {2} -M PatientRoot -m PatientID={3} --dest {4}".format(DCM4CHE_BIN, SRC_AET, QUERY_AET, PatientID, DEST_AET), shell=True) + extracted_ones.append(PatientID) + # For the cases that does not have the typical EMPI and Accession values together. 
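# Illustrative sketch (not part of this patch): the nightly-mode guard restored in
# patch 02 above pauses extraction while the current hour falls between END_HOUR
# (morning) and START_HOUR (evening). The hour values here are assumptions.
import datetime
import time

START_HOUR = 19   # assumed evening start of the extraction window
END_HOUR = 7      # assumed morning end of the extraction window

def in_daytime_pause():
    hour = datetime.datetime.now().hour
    return END_HOUR <= hour < START_HOUR

def wait_for_nightly_window():
    # sleep in 5-minute steps until the nightly window opens again
    while in_daytime_pause():
        time.sleep(300)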
elif (extraction_type == 'empi_date' or extraction_type == 'accession'): # Create our Identifier (query) dataset From 3ef6f100581c0fbf43a6152aa238427b71f5b3f7 Mon Sep 17 00:00:00 2001 From: Nishchal-007 Date: Thu, 1 Apr 2021 18:28:25 +0530 Subject: [PATCH 03/20] Updated ColdDataRetriever.py --- modules/cold-extraction/ColdDataRetriever.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/cold-extraction/ColdDataRetriever.py b/modules/cold-extraction/ColdDataRetriever.py index 594a8eb..10c2176 100644 --- a/modules/cold-extraction/ColdDataRetriever.py +++ b/modules/cold-extraction/ColdDataRetriever.py @@ -135,7 +135,7 @@ def initialize(): for row in reader: row = [x.strip() for x in row] if (extraction_type == 'empi_date'): - if set(row).pop()=='': + if ((row[patient_index] == "") or (row[date_index] == "")): pass else: patients.append(row[patient_index]) @@ -145,19 +145,19 @@ def initialize(): dates.append(date_str) length = len(patients) elif (extraction_type == 'empi'): - if set(row).pop()=='': + if ((row[patient_index] == "")): pass else: patients.append(row[patient_index]) length = len(patients) elif (extraction_type == 'accession'): - if set(row).pop()=='': + if ((row[accession_index] == "")): pass else: accessions.append(row[accession_index]) length = len(accessions) elif (extraction_type == 'empi_accession'): - if set(row).pop()=='': + if ((row[patient_index] == "") or (row[accession_index] == "")): pass else: patients.append(row[patient_index]) From 6ce407f0eeec05db285c244eb89e248007c85a9e Mon Sep 17 00:00:00 2001 From: jeong-jasonji <55253180+jeong-jasonji@users.noreply.github.com> Date: Thu, 1 Apr 2021 10:16:46 -0400 Subject: [PATCH 04/20] little update - filename matches the anonymized SOPInstanceUID - requires pydicom 1.4.2 or above... 
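# Illustrative sketch (not part of this patch): patch 03 above replaces
# set(row).pop() == '' with explicit per-column checks. A set is unordered, so
# .pop() returns an arbitrary cell and only notices a blank field by chance:
row = ['MRN123', '', 'unused-extra-cell']

print(set(row).pop() == '')        # unreliable: may print True or False
patient_index, date_index = 0, 1   # assumed column layout for the example
print(row[patient_index] == '' or row[date_index] == '')   # True: the date cell is blank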
--- modules/png-extraction/anon_pydicom.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/modules/png-extraction/anon_pydicom.py b/modules/png-extraction/anon_pydicom.py index b1ae396..3595d43 100644 --- a/modules/png-extraction/anon_pydicom.py +++ b/modules/png-extraction/anon_pydicom.py @@ -10,7 +10,6 @@ import sys import pydicom import random -import shutil import pickle @@ -102,20 +101,17 @@ def dcm_anonymize(dcm_folders, output_path, stop=None): test_file = pydicom.dcmread(test_file_path) anon_id = anonSample(test_file, 'StudyInstanceUID', UIDs['StudyInstanceUID']) # make folder with the anonymized studyUID name - print(anon_id) study_folder = os.path.join(output_path, anon_id) os.mkdir(study_folder) for file in files: - # copy the file to the new anon folder - shutil.copyfile(os.path.join(dcm_folder, file), os.path.join(study_folder, file)) - dcm_file = pydicom.dcmread(os.path.join(study_folder, file)) + dcm_file = pydicom.dcmread(os.path.join(dcm_folder, file)) dcm_file.remove_private_tags() for UID in UIDs.keys(): # get the UID and get the anonymized UID anon_id = anonSample(dcm_file, UID, UIDs[UID]) # save instance UID to rename the filename (so that filename and SOPinstance matches) if UID == 'SOPInstanceUID': - new_filename = anon_id.copy() + new_filename = anon_id dcm_file[UID].value = anon_id # for the other tags, make them anonymous for tag in anon_tags: @@ -128,9 +124,9 @@ def dcm_anonymize(dcm_folders, output_path, stop=None): dcm_file.data_element(tag).value = 0 else: dcm_file.data_element(tag).value = 0.0 - dcm_file.save_as(os.path.join(study_folder, new_filename)) + dcm_file.save_as(os.path.join(study_folder, new_filename + '.dcm')) n += 1 - print('total files anonymized: {}/{}. Study: {}'.format(n, len(dcm_folders), study_folder), flush=True) + print('total folders anonymized: {}/{}. 
Study: {}'.format(n, len(dcm_folders), study_folder), flush=True) except: print('Invalid Dicom Error, skipping') skip_file = pydicom.dcmread(test_file_path, force=True) @@ -147,9 +143,10 @@ def dcm_anonymize(dcm_folders, output_path, stop=None): if __name__ == "__main__": # ex: 'python anon_pydicom.py /labs/banerjeelab/researchpacs_data/ /labs/banerjeelab/HCC_anon_dcm/200_noForce/' + # 'python anon_pydicom.py r'C:\Users\Jason\Desktop\Code_files\HITI\Niffler' r'C:\Users\Jason\Desktop\Code_files\HITI\Niffler\test_out'' data_dir = sys.argv[1] output_dir = sys.argv[2] - if len(sys.argv) > 2: + if len(sys.argv) > 3: # stopping number stop = int(sys.argv[3]) else: @@ -157,4 +154,8 @@ def dcm_anonymize(dcm_folders, output_path, stop=None): print('Extracting DICOM folders', flush=True) dcm_folders = get_dcm_folders(data_dir) print('Starting DICOM Study Anonymization', flush=True) - dcm_anonymize(dcm_folders, output_dir, stop=stop) + dcm_anonymize(dcm_folders, output_dir, stop=None) + + +data_dir = r'C:\Users\Jason\Desktop\Code_files\HITI\Niffler\test_files' +output_dir = r'C:\Users\Jason\Desktop\Code_files\HITI\Niffler\test_out' \ No newline at end of file From 92e0d594e9fe7bdb27b73bc742b93314d2897554 Mon Sep 17 00:00:00 2001 From: jeong-jasonji <55253180+jeong-jasonji@users.noreply.github.com> Date: Thu, 1 Apr 2021 13:59:23 -0400 Subject: [PATCH 05/20] removed hardcode --- modules/png-extraction/anon_pydicom.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/modules/png-extraction/anon_pydicom.py b/modules/png-extraction/anon_pydicom.py index 3595d43..272c2cd 100644 --- a/modules/png-extraction/anon_pydicom.py +++ b/modules/png-extraction/anon_pydicom.py @@ -155,7 +155,3 @@ def dcm_anonymize(dcm_folders, output_path, stop=None): dcm_folders = get_dcm_folders(data_dir) print('Starting DICOM Study Anonymization', flush=True) dcm_anonymize(dcm_folders, output_dir, stop=None) - - -data_dir = r'C:\Users\Jason\Desktop\Code_files\HITI\Niffler\test_files' -output_dir = r'C:\Users\Jason\Desktop\Code_files\HITI\Niffler\test_out' \ No newline at end of file From 4e1a602505684ce14c85f4f693e9b1e46399a02c Mon Sep 17 00:00:00 2001 From: Nishchal Singi <71981858+Nishchal-007@users.noreply.github.com> Date: Sat, 3 Apr 2021 19:33:51 +0530 Subject: [PATCH 06/20] Made required changes --- modules/cold-extraction/ColdDataRetriever.py | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/modules/cold-extraction/ColdDataRetriever.py b/modules/cold-extraction/ColdDataRetriever.py index 10c2176..9f1e2d3 100644 --- a/modules/cold-extraction/ColdDataRetriever.py +++ b/modules/cold-extraction/ColdDataRetriever.py @@ -130,14 +130,10 @@ def initialize(): with open(csv_file, newline='') as f: reader = csv.reader(f) next(f) - - # Changed below part for finding missing csv entries and skipping them for row in reader: row = [x.strip() for x in row] if (extraction_type == 'empi_date'): - if ((row[patient_index] == "") or (row[date_index] == "")): - pass - else: + if not ((row[patient_index] == "") or (row[date_index] == "")): patients.append(row[patient_index]) temp_date = row[date_index] dt_stamp = datetime.datetime.strptime(temp_date, date_format) @@ -145,21 +141,15 @@ def initialize(): dates.append(date_str) length = len(patients) elif (extraction_type == 'empi'): - if ((row[patient_index] == "")): - pass - else: + if not ((row[patient_index] == "")): patients.append(row[patient_index]) length = len(patients) elif (extraction_type == 'accession'): - if ((row[accession_index] == "")): - pass 
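# Illustrative sketch (not the module's actual code): patch 04 above saves each
# anonymized instance under its new SOPInstanceUID so the file name and the header
# stay in sync (the commit notes pydicom 1.4.2 or later is required). Using
# generate_uid() as the anonymized identifier is an assumption standing in for the
# module's anonSample() helper.
import os
import pydicom
from pydicom.uid import generate_uid

def anonymize_and_rename(dcm_path, out_dir):
    ds = pydicom.dcmread(dcm_path)
    ds.remove_private_tags()
    new_uid = generate_uid()          # assumption: fresh UID instead of anonSample()
    ds.SOPInstanceUID = new_uid
    out_path = os.path.join(out_dir, new_uid + '.dcm')
    ds.save_as(out_path)              # file name now matches the anonymized SOPInstanceUID
    return out_path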
- else: + if not ((row[accession_index] == "")): accessions.append(row[accession_index]) length = len(accessions) elif (extraction_type == 'empi_accession'): - if ((row[patient_index] == "") or (row[accession_index] == "")): - pass - else: + if not ((row[patient_index] == "") or (row[accession_index] == "")): patients.append(row[patient_index]) accessions.append(row[accession_index]) length = len(accessions) From eab58a3f8bcd0f8826d9416bb1ca65efe97c2e78 Mon Sep 17 00:00:00 2001 From: Pradeeban Kathiravelu Date: Sat, 3 Apr 2021 21:52:07 -0400 Subject: [PATCH 07/20] Move anon_pydicom to a new module --- modules/{png-extraction => dicom-anonymization}/anon_pydicom.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename modules/{png-extraction => dicom-anonymization}/anon_pydicom.py (100%) diff --git a/modules/png-extraction/anon_pydicom.py b/modules/dicom-anonymization/anon_pydicom.py similarity index 100% rename from modules/png-extraction/anon_pydicom.py rename to modules/dicom-anonymization/anon_pydicom.py From 5f75125126efe9620fc4697bad2dd8b639e93694 Mon Sep 17 00:00:00 2001 From: Pradeeban Kathiravelu Date: Sat, 3 Apr 2021 22:46:38 -0400 Subject: [PATCH 08/20] Anonymize Dicom files --- .../dicom-anonymization/{anon_pydicom.py => DicomAnonymizer.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename modules/dicom-anonymization/{anon_pydicom.py => DicomAnonymizer.py} (100%) diff --git a/modules/dicom-anonymization/anon_pydicom.py b/modules/dicom-anonymization/DicomAnonymizer.py similarity index 100% rename from modules/dicom-anonymization/anon_pydicom.py rename to modules/dicom-anonymization/DicomAnonymizer.py From ef910b0765d872d47c83977c6227d7cb7459e640 Mon Sep 17 00:00:00 2001 From: Pradeeban Kathiravelu Date: Sat, 3 Apr 2021 22:47:52 -0400 Subject: [PATCH 09/20] Update DicomAnonymizer.py Remove comment --- modules/dicom-anonymization/DicomAnonymizer.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/modules/dicom-anonymization/DicomAnonymizer.py b/modules/dicom-anonymization/DicomAnonymizer.py index 272c2cd..b4fdaae 100644 --- a/modules/dicom-anonymization/DicomAnonymizer.py +++ b/modules/dicom-anonymization/DicomAnonymizer.py @@ -142,8 +142,6 @@ def dcm_anonymize(dcm_folders, output_path, stop=None): if __name__ == "__main__": - # ex: 'python anon_pydicom.py /labs/banerjeelab/researchpacs_data/ /labs/banerjeelab/HCC_anon_dcm/200_noForce/' - # 'python anon_pydicom.py r'C:\Users\Jason\Desktop\Code_files\HITI\Niffler' r'C:\Users\Jason\Desktop\Code_files\HITI\Niffler\test_out'' data_dir = sys.argv[1] output_dir = sys.argv[2] if len(sys.argv) > 3: From cfc02822ca292ba75a869f3d1e658b4d4bd191c2 Mon Sep 17 00:00:00 2001 From: Pradeeban Kathiravelu Date: Sat, 3 Apr 2021 22:49:57 -0400 Subject: [PATCH 10/20] Create README.md --- modules/dicom-anonymization/README.md | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 modules/dicom-anonymization/README.md diff --git a/modules/dicom-anonymization/README.md b/modules/dicom-anonymization/README.md new file mode 100644 index 0000000..e990d49 --- /dev/null +++ b/modules/dicom-anonymization/README.md @@ -0,0 +1,6 @@ +# The Niffler DICOM Anonymizer + +You may convert a DICOM file into an anonymized DICOM file by running +``` +python DicomAnonymizer.py +``` From c0039b03c1654e23587a8937c08b2c3d1204d40c Mon Sep 17 00:00:00 2001 From: Pradeeban Kathiravelu Date: Sat, 3 Apr 2021 22:51:27 -0400 Subject: [PATCH 11/20] Update README.md Fix module reference --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/README.md b/README.md index 57e673f..69b01f1 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ Niffler enables receiving DICOM images real-time as a data stream from PACS as w # Configure Niffler -Niffler consists of 4 modules, inside the modules folder. Here we will look into the common configuration and installation steps of Niffler. An introduction to Niffler can be found [here](https://emory-hiti.github.io/Niffler/). +Niffler consists of 5 modules, inside the modules folder. Here we will look into the common configuration and installation steps of Niffler. An introduction to Niffler can be found [here](https://emory-hiti.github.io/Niffler/). ## Configure PACS From 548ba59b5696b05012ef944384c7d1a6494bee25 Mon Sep 17 00:00:00 2001 From: Pradeeban Kathiravelu Date: Sat, 3 Apr 2021 22:52:54 -0400 Subject: [PATCH 12/20] Update index.md --- docs/index.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/index.md b/docs/index.md index 8c28baa..e7f8165 100644 --- a/docs/index.md +++ b/docs/index.md @@ -21,6 +21,10 @@ Receives DICOM images as a stream from a PACS and extracts and stores the metada Converts a set of DICOM images into png images, extract metadata in a privacy-preserving manner. The extracted metadata is stored in a CSV file, along with the de-identified PNG images. The mapping of PNG files and their respective metadata is stored in a separate CSV file. +## dicom-anonymization + +Converts a set of DICOM images into anonymized DICOM images, stripping off the PHI. + ## app-layer The app-layer (application layer) consists of specific algorithms. The app-layer/src/main/scripts consists of Javascript scripts such as scanner clock calibration. The app-layer/src/main/java consists of the the scanner utilization computation algorithms developed in Java. From 03168977ce7d9173fb35eb179a71714d44cd0747 Mon Sep 17 00:00:00 2001 From: Pradeeban Kathiravelu Date: Mon, 5 Apr 2021 16:44:54 -0400 Subject: [PATCH 13/20] Update ImageExtractor.py Remove misleading log. 
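# Illustrative sketch (not part of this patch): the log line removed in patch 13
# below hard-codes "100" even though the chunk count comes from the SplitIntoChunks
# setting. A version that reports the configured value would read:
import logging
import numpy as np

def split_into_chunks(filelist, no_splits):
    file_chunks = np.array_split(filelist, no_splits)
    logging.info('Number of dicom files: %d', len(filelist))
    # report the configured chunk count instead of a hard-coded 100
    logging.info('Number of chunks is %d with size %d', no_splits, len(file_chunks[0]))
    return file_chunks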
--- modules/png-extraction/ImageExtractor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/png-extraction/ImageExtractor.py b/modules/png-extraction/ImageExtractor.py index 5e22212..db57c8a 100644 --- a/modules/png-extraction/ImageExtractor.py +++ b/modules/png-extraction/ImageExtractor.py @@ -298,7 +298,6 @@ def fix_mismatch(with_VRs=['PN', 'DS', 'IS']): pickle.dump(filelist,open(pickle_file,'wb')) file_chunks = np.array_split(filelist,no_splits) logging.info('Number of dicom files: ' + str(len(filelist))) -logging.info('Number of chunks is 100 with size ' + str(len(file_chunks[0])) ) try: ff = filelist[0] #load first file as a template to look at all From a3bd039c4f35aa060b602256bab572ba1eb77906 Mon Sep 17 00:00:00 2001 From: Ramon Luis Correa Medero Date: Mon, 5 Apr 2021 17:05:39 -0400 Subject: [PATCH 14/20] updated cpu count call and changed how pool is managed --- modules/png-extraction/ImageExtractor.py | 27 ++++++++++++------------ 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/modules/png-extraction/ImageExtractor.py b/modules/png-extraction/ImageExtractor.py index db57c8a..8d33b1e 100644 --- a/modules/png-extraction/ImageExtractor.py +++ b/modules/png-extraction/ImageExtractor.py @@ -1,26 +1,24 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -import numpy as np -import pandas as pd -import pydicom as dicom -import png, os, glob -import PIL as pil -from pprint import pprint -import hashlib +import os +import glob from shutil import copyfile -import logging -from multiprocessing import Pool +import hashlib import json import sys import subprocess +import logging +from multiprocessing import Pool import pdb +import time import pickle +import numpy as np +import pandas as pd +import pydicom as dicom #pydicom imports needed to handle data errrors from pydicom import config from pydicom import datadict from pydicom import values -from subprocess import Popen -import time with open('config.json', 'r') as f: niffler = json.load(f) @@ -243,7 +241,7 @@ def fix_mismatch_callback(raw_elem, **kwargs): pass else: raw_elem = raw_elem._replace(VR=vr) - break # I want to exit immediately after change is applied + break return raw_elem @@ -298,6 +296,7 @@ def fix_mismatch(with_VRs=['PN', 'DS', 'IS']): pickle.dump(filelist,open(pickle_file,'wb')) file_chunks = np.array_split(filelist,no_splits) logging.info('Number of dicom files: ' + str(len(filelist))) +logging.info('Number of chunks is 100 with size ' + str(len(file_chunks[0])) ) try: ff = filelist[0] #load first file as a template to look at all @@ -346,7 +345,7 @@ def fix_mismatch(with_VRs=['PN', 'DS', 'IS']): filedata=data total = len(chunk) stamp = time.time() - p = Pool(os.cpu_count()) + p = Pool(core_count) res = p.imap_unordered(extract_images,range(len(filedata))) for out in res: (fmap,fail_path,err) = out @@ -357,6 +356,8 @@ def fix_mismatch(with_VRs=['PN', 'DS', 'IS']): logging.error(err_msg) else: fm.write(fmap) + p.join() + p.close() fm.close() logging.info('Chunk run time: %s %s', time.time() - t_start, ' seconds!') From bf828388f911be950aa0d657ebcf58781c97430b Mon Sep 17 00:00:00 2001 From: Ramon Luis Correa Medero Date: Mon, 5 Apr 2021 17:09:17 -0400 Subject: [PATCH 15/20] removed hardcoded log line --- modules/png-extraction/ImageExtractor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/png-extraction/ImageExtractor.py b/modules/png-extraction/ImageExtractor.py index 8d33b1e..41f1d11 100644 --- a/modules/png-extraction/ImageExtractor.py +++ b/modules/png-extraction/ImageExtractor.py @@ 
-296,7 +296,6 @@ def fix_mismatch(with_VRs=['PN', 'DS', 'IS']): pickle.dump(filelist,open(pickle_file,'wb')) file_chunks = np.array_split(filelist,no_splits) logging.info('Number of dicom files: ' + str(len(filelist))) -logging.info('Number of chunks is 100 with size ' + str(len(file_chunks[0])) ) try: ff = filelist[0] #load first file as a template to look at all From 2d097329078316a2d283beb384eeaf671ae692c1 Mon Sep 17 00:00:00 2001 From: Ramon Luis Correa Medero Date: Tue, 6 Apr 2021 00:44:40 -0400 Subject: [PATCH 16/20] updates to error reporting, pool processing, Pooling update: 1. Changed pooling to use with satement instead of explicit calls to close and joins. The context manager handles this better than i would. 2. With statement results in better error reporting for edge cases. Strange print statement is preserved as i run more test. Uploading here for record purposes --- modules/png-extraction/ImageExtractor.py | 32 +++++++++++++----------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/modules/png-extraction/ImageExtractor.py b/modules/png-extraction/ImageExtractor.py index 41f1d11..d95a05d 100644 --- a/modules/png-extraction/ImageExtractor.py +++ b/modules/png-extraction/ImageExtractor.py @@ -15,6 +15,7 @@ import numpy as np import pandas as pd import pydicom as dicom +import png #pydicom imports needed to handle data errrors from pydicom import config from pydicom import datadict @@ -163,8 +164,7 @@ def extract_images(i): folderName = hashlib.sha224(ID1.encode('utf-8')).hexdigest() + "/" + \ hashlib.sha224(ID2.encode('utf-8')).hexdigest() #check for existence of the folder tree patient/study/series. Create if it does not exist. - if not (os.path.exists(png_destination + folderName)): # it is completely possible for multiple proceses to run this check at same time. - os.makedirs(png_destination + folderName) + os.makedirs(png_destination + folderName,exist_ok=True) else: ID1=filedata.iloc[i].loc['PatientID'] # Unique identifier for the Patient. 
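# Illustrative sketch (not part of this patch): the directory-creation change in
# patch 16 above swaps the "check then mkdir" pair for os.makedirs(..., exist_ok=True),
# closing the race where two worker processes both pass the existence check and one
# of them then fails inside makedirs.
import os

def ensure_dir(path):
    # safe to call concurrently from several worker processes
    os.makedirs(path, exist_ok=True)
    return path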
try: @@ -220,6 +220,9 @@ def extract_images(i): except BaseException as error: found_err = error logging.error(found_err) + print('---pokemon--') + print(error) + print(found_err) fail_path = filedata.iloc[i].loc['file'], failed + '3/' + os.path.split(filedata.iloc[i].loc['file'])[1][:-4]+'.dcm' except Exception as error: found_err = error @@ -296,6 +299,7 @@ def fix_mismatch(with_VRs=['PN', 'DS', 'IS']): pickle.dump(filelist,open(pickle_file,'wb')) file_chunks = np.array_split(filelist,no_splits) logging.info('Number of dicom files: ' + str(len(filelist))) +logging.info('Number of chunks is 100 with size ' + str(len(file_chunks[0])) ) try: ff = filelist[0] #load first file as a template to look at all @@ -344,19 +348,17 @@ def fix_mismatch(with_VRs=['PN', 'DS', 'IS']): filedata=data total = len(chunk) stamp = time.time() - p = Pool(core_count) - res = p.imap_unordered(extract_images,range(len(filedata))) - for out in res: - (fmap,fail_path,err) = out - if err: - count +=1 - copyfile(fail_path[0],fail_path[1]) - err_msg = str(count) + ' out of ' + str(len(chunk)) + ' dicom images have failed extraction' - logging.error(err_msg) - else: - fm.write(fmap) - p.join() - p.close() + with Pool(core_count) as p: + res = p.imap_unordered(extract_images,range(len(filedata))) + for out in res: + (fmap,fail_path,err) = out + if err: + count +=1 + copyfile(fail_path[0],fail_path[1]) + err_msg = str(count) + ' out of ' + str(len(chunk)) + ' dicom images have failed extraction' + logging.error(err_msg) + else: + fm.write(fmap) fm.close() logging.info('Chunk run time: %s %s', time.time() - t_start, ' seconds!') From b30fda265f08123868b573c4fc6f19de4de131f6 Mon Sep 17 00:00:00 2001 From: Ramon Luis Correa Medero Date: Tue, 6 Apr 2021 12:15:02 -0400 Subject: [PATCH 17/20] remove all makedir calls --- modules/ImageExtractor.py | 396 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 396 insertions(+) create mode 100644 modules/ImageExtractor.py diff --git a/modules/ImageExtractor.py b/modules/ImageExtractor.py new file mode 100644 index 0000000..3422f5f --- /dev/null +++ b/modules/ImageExtractor.py @@ -0,0 +1,396 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +import os +import glob +from shutil import copyfile +import hashlib +import json +import sys +import subprocess +import logging +from multiprocessing import Pool +import pdb +import time +import pickle +import numpy as np +import pandas as pd +import pydicom as dicom +import png +#pydicom imports needed to handle data errrors +from pydicom import config +from pydicom import datadict +from pydicom import values + +with open('config.json', 'r') as f: + niffler = json.load(f) + +#Get variables for StoreScp from config.json. +print_images = niffler['PrintImages'] +print_only_common_headers = niffler['CommonHeadersOnly'] +dicom_home = niffler['DICOMHome'] #the folder containing your dicom files +output_directory = niffler['OutputDirectory'] +depth = niffler['Depth'] +processes = niffler['UseProcesses'] #how many processes to use. 
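# Illustrative sketch (not part of this patch): patch 16 above moves the worker pool
# into a with-statement, so the pool is cleaned up on leaving the block even when a
# worker raises. It also replaces the earlier explicit p.join()/p.close() calls,
# which were in the wrong order (close must precede join).
from multiprocessing import Pool

def square(x):
    return x * x

def run_parallel(items, workers=4):
    results = []
    with Pool(workers) as p:                     # pool is terminated on leaving the block
        for out in p.imap_unordered(square, items):
            results.append(out)
    return results

if __name__ == '__main__':
    print(run_parallel(range(10)))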
+flattened_to_level = niffler['FlattenedToLevel'] +email = niffler['YourEmail'] +send_email = niffler['SendEmail'] +no_splits = niffler['SplitIntoChunks'] +is16Bit = niffler['is16Bit'] + +png_destination = output_directory + '/extracted-images/' +failed = output_directory +'/failed-dicom/' +maps_directory = output_directory + '/maps/' +meta_directory = output_directory + '/meta/' + +LOG_FILENAME = output_directory + '/ImageExtractor.out' +pickle_file = output_directory + '/ImageExtractor.pickle' +# record the start time +t_start = time.time() + +if not os.path.exists(output_directory): + os.makedirs(output_directory) + +logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) + +if not os.path.exists(maps_directory): + os.makedirs(maps_directory) + +if not os.path.exists(meta_directory): + os.makedirs(meta_directory) + +if not os.path.exists(png_destination): + os.makedirs(png_destination) + +if not os.path.exists(failed): + os.makedirs(failed) + +if not os.path.exists(failed + "/1"): + os.makedirs(failed + "/1") + +if not os.path.exists(failed + "/2"): + os.makedirs(failed + "/2") + +if not os.path.exists(failed + "/3"): + os.makedirs(failed + "/3") + +if not os.path.exists(failed + "/4"): + os.makedirs(failed + "/4") + +#%%Function for getting tuple for field,val pairs +def get_tuples(plan, outlist = None, key = ""): + if len(key)>0: + key = key + "_" + if not outlist: + outlist = [] + for aa in plan.dir(): + try: + hasattr(plan,aa) + except TypeError as e: + logging.warning('Type Error encountered') + if (hasattr(plan, aa) and aa!='PixelData'): + value = getattr(plan, aa) + start = len(outlist) + #if dicom sequence extract tags from each element + if type(value) is dicom.sequence.Sequence: + for nn, ss in enumerate(list(value)): + newkey = "_".join([key,("%d"%nn),aa]) if len(key) else "_".join([("%d"%nn),aa]) + candidate = get_tuples(ss,outlist=None,key=newkey) + #if extracted tuples are too big condense to a string + if len(candidate)>2000: + outlist.append((newkey,str(candidate))) + else: + outlist.extend(candidate) + else: + if type(value) is dicom.valuerep.DSfloat: + value = float(value) + elif type(value) is dicom.valuerep.IS: + value = str(value) + elif type(value) is dicom.valuerep.MultiValue: + value = tuple(value) + elif type(value) is dicom.uid.UID: + value = str(value) + outlist.append((key + aa, value)) #appends name, value pair for this file. these are later concatenated to the dataframe + return outlist + + +def extract_headers(f_list_elem): + nn,ff = f_list_elem # unpack enumerated list + plan = dicom.dcmread(ff, force=True) #reads in dicom file + #checks if this file has an image + c=True + try: + check=plan.pixel_array #throws error if dicom file has no image + except: + c = False + kv = get_tuples(plan) #gets tuple for field,val pairs for this file. 
function defined above + # dicom images should not have more than 300 + if len(kv)>500: + logging.debug(str(len(kv)) + " dicoms produced by " + ff) + kv.append(('file',chunk[nn])) #adds my custom field with the original filepath + kv.append(('has_pix_array',c)) #adds my custom field with if file has image + if c: + kv.append(('category','uncategorized')) #adds my custom category field - useful if classifying images before processing + else: + kv.append(('category','no image')) #adds my custom category field, makes note as imageless + return dict(kv) + +#%%Function to extract pixel array information +#takes an integer used to index into the global filedata dataframe +#returns tuple of +# filemapping: dicom to png paths (as str) +# fail_path: dicom to failed folder (as tuple) +# found_err: error code produced when processing +def extract_images(i): + ds = dicom.dcmread(filedata.iloc[i].loc['file'], force=True) #read file in + found_err=None + filemapping = "" + fail_path = "" + try: + im=ds.pixel_array #pull image from read dicom + imName=os.path.split(filedata.iloc[i].loc['file'])[1][:-4] #get file name ex: IM-0107-0022 + + if flattened_to_level == 'patient': + ID=filedata.iloc[i].loc['PatientID'] # Unique identifier for the Patient. + folderName = hashlib.sha224(ID.encode('utf-8')).hexdigest() + #check for existence of patient folder. Create if it does not exist. + os.makedirs(png_destination + folderName,exist_ok=True) + elif flattened_to_level == 'study': + ID1=filedata.iloc[i].loc['PatientID'] # Unique identifier for the Patient. + try: + ID2=filedata.iloc[i].loc['StudyInstanceUID'] # Unique identifier for the Study. + except: + ID2='ALL-STUDIES' + folderName = hashlib.sha224(ID1.encode('utf-8')).hexdigest() + "/" + \ + hashlib.sha224(ID2.encode('utf-8')).hexdigest() + #check for existence of the folder tree patient/study/series. Create if it does not exist. + os.makedirs(png_destination + folderName,exist_ok=True) + else: + ID1=filedata.iloc[i].loc['PatientID'] # Unique identifier for the Patient. + try: + ID2=filedata.iloc[i].loc['StudyInstanceUID'] # Unique identifier for the Study. + ID3=filedata.iloc[i].loc['SeriesInstanceUID'] # Unique identifier of the Series. + except: + ID2='ALL-STUDIES' + ID3='ALL-SERIES' + folderName = hashlib.sha224(ID1.encode('utf-8')).hexdigest() + "/" + \ + hashlib.sha224(ID2.encode('utf-8')).hexdigest() + "/" + hashlib.sha224(ID3.encode('utf-8')).hexdigest() + #check for existence of the folder tree patient/study/series. Create if it does not exist. + os.makedirs(png_destination + folderName,exist_ok=True) + + + pngfile = png_destination+folderName+'/' + hashlib.sha224(imName.encode('utf-8')).hexdigest() + '.png' + dicom_path = filedata.iloc[i].loc['file'] + image_path = png_destination+folderName+'/' + hashlib.sha224(imName.encode('utf-8')).hexdigest() + '.png' + if is16Bit: + # write the PNG file as a 16-bit greyscale + image_2d = ds.pixel_array.astype(np.double) + # # Rescaling grey scale between 0-255 + image_2d_scaled = (np.maximum(image_2d,0) / image_2d.max()) * 65535.0 + # # Convert to uint + shape = ds.pixel_array.shape + image_2d_scaled = np.uint16(image_2d_scaled) + with open(pngfile , 'wb') as png_file: + w = png.Writer(shape[1], shape[0], greyscale=True,bitdepth=16) + w.write(png_file, image_2d_scaled) + else: + shape = ds.pixel_array.shape + # # Convert to float to avoid overflow or underflow losses. 
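# Illustrative sketch (not part of this patch): the 16-bit branch above rescales the
# pixel array into the 0-65535 range and writes it with pypng as 16-bit greyscale.
# Stand-alone version, assuming a monochrome, 2-D pixel array:
import numpy as np
import png
import pydicom

def dicom_to_png16(dcm_path, png_path):
    ds = pydicom.dcmread(dcm_path)
    image_2d = ds.pixel_array.astype(np.double)
    # rescale grey levels into the full 16-bit range
    scaled = np.uint16((np.maximum(image_2d, 0) / image_2d.max()) * 65535.0)
    height, width = image_2d.shape
    with open(png_path, 'wb') as f:
        writer = png.Writer(width, height, greyscale=True, bitdepth=16)
        writer.write(f, scaled)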
+ image_2d = ds.pixel_array.astype(float) + # + # # Rescaling grey scale between 0-255 + image_2d_scaled = (np.maximum(image_2d,0) / image_2d.max()) * 255.0 + # + # # Convert to uint + image_2d_scaled = np.uint8(image_2d_scaled) + # # Write the PNG file + with open(pngfile , 'wb') as png_file: + w = png.Writer(shape[1], shape[0], greyscale=True) + w.write(png_file, image_2d_scaled) + filemapping = filedata.iloc[i].loc['file'] + ', ' + pngfile + '\n' + except AttributeError as error: + found_err = error + logging.error(found_err) + fail_path = filedata.iloc[i].loc['file'], failed + '1/' + os.path.split(filedata.iloc[i].loc['file'])[1][:-4]+'.dcm' + except ValueError as error: + found_err = error + logging.error(found_err) + fail_path = filedata.iloc[i].loc['file'], failed + '2/' + os.path.split(filedata.iloc[i].loc['file'])[1][:-4]+'.dcm' + except BaseException as error: + found_err = error + logging.error(found_err) + fail_path = filedata.iloc[i].loc['file'], failed + '3/' + os.path.split(filedata.iloc[i].loc['file'])[1][:-4]+'.dcm' + except Exception as error: + found_err = error + logging.error(found_err) + fail_path = filedata.iloc[i].loc['file'], failed + '4/' + os.path.split(filedata.iloc[i].loc['file'])[1][:-4]+'.dcm' + return (filemapping,fail_path,found_err) + + +#%%Function when pydicom fails to read a value attempt to read as +#other types. +def fix_mismatch_callback(raw_elem, **kwargs): + try: + values.convert_value(raw_elem.VR, raw_elem) + except TypeError: + for vr in kwargs['with_VRs']: + try: + values.convert_value(vr, raw_elem) + except TypeError: + pass + else: + raw_elem = raw_elem._replace(VR=vr) + break + return raw_elem + + +def get_path(depth): + directory = dicom_home + '/' + i = 0 + while i < depth: + directory += "*/" + i += 1 + return directory + "*.dcm" + +#%%Function used by pydicom. +def fix_mismatch(with_VRs=['PN', 'DS', 'IS']): + """A callback function to check that RawDataElements are translatable + with their provided VRs. If not, re-attempt translation using + some other translators. + Parameters + ---------- + with_VRs : list, [['PN', 'DS', 'IS']] + A list of VR strings to attempt if the raw data element value cannot + be translated with the raw data element's VR. + Returns + ------- + No return value. The callback function will return either + the original RawDataElement instance, or one with a fixed VR. + """ + dicom.config.data_element_callback = fix_mismatch_callback + config.data_element_callback_kwargs = { + 'with_VRs': with_VRs, + } + +fix_mismatch() +if processes == 0.5: # use half the cores to avoid high ram usage + core_count = int(os.cpu_count()/2) +elif processes == 0: # use all the cores + core_count = int(os.cpu_count()) +elif processes < os.cpu_count(): # use the specified number of cores to avoid high ram usage + core_count = processes +else: + core_count = int(os.cpu_count()) +#%% get set up to create dataframe +dirs = os.listdir(dicom_home) +#gets all dicom files. if editing this code, get filelist into the format of a list of strings, +#with each string as the file path to a different dicom file. 
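# Illustrative sketch (not part of this patch): the block just below caches the glob
# result in a pickle so that a re-run skips the directory walk. Minimal version of
# that cache-or-scan pattern:
import glob
import os
import pickle

def cached_file_list(pattern, pickle_path):
    if os.path.isfile(pickle_path):
        with open(pickle_path, 'rb') as f:
            return pickle.load(f)            # reuse the earlier scan
    filelist = glob.glob(pattern, recursive=True)
    with open(pickle_path, 'wb') as f:
        pickle.dump(filelist, f)
    return filelist

# usage, mirroring the depth-based pattern built by get_path():
# filelist = cached_file_list(dicom_home + '/*/*/*.dcm', pickle_file)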
+file_path = get_path(depth) + +if os.path.isfile(pickle_file): + f=open(pickle_file,'rb') + filelist=pickle.load(f) +else: + filelist=glob.glob(file_path, recursive=True) #this searches the folders at the depth we request and finds all dicoms + pickle.dump(filelist,open(pickle_file,'wb')) +file_chunks = np.array_split(filelist,no_splits) +logging.info('Number of dicom files: ' + str(len(filelist))) +logging.info('Number of chunks is 100 with size ' + str(len(file_chunks[0])) ) + +try: + ff = filelist[0] #load first file as a template to look at all +except IndexError: + logging.error("There is no file present in the given folder in " + file_path) + sys.exit(1) + +plan = dicom.dcmread(ff, force=True) +logging.debug('Loaded the first file successfully') + +keys = [(aa) for aa in plan.dir() if (hasattr(plan, aa) and aa!='PixelData')] +#%%checks for images in fields and prints where they are +for field in plan.dir(): + if (hasattr(plan, field) and field!='PixelData'): + entry = getattr(plan, field) + if type(entry) is bytes: + logging.debug(field) + logging.debug(str(entry)) +for i,chunk in enumerate(file_chunks): + csv_destination = "{}/meta/metadata_{}.csv".format(output_directory,i) + mappings ="{}/maps/mapping_{}.csv".format(output_directory,i) + fm = open(mappings, "w+") + filemapping = 'Original DICOM file location, PNG location \n' + fm.write(filemapping) + # add a check to see if the metadata has already been extracted + #%%step through whole file list, read in file, append fields to future dataframe of all files + headerlist = [] + #start up a multi processing pool + #for every item in filelist send data to a subprocess and run extract_headers func + #output is then added to headerlist as they are completed (no ordering is done) + with Pool(core_count) as p: + res= p.imap_unordered(extract_headers,enumerate(chunk)) + for i,e in enumerate(res): + headerlist.append(e) + data = pd.DataFrame(headerlist) + logging.info('Chunk ' + str(i) + ' Number of fields per file : ' + str(len(data.columns))) + #%%find common fields + #make dataframe containing all fields and all files minus those removed in previous block + #%%export csv file of final dataframe + export_csv = data.to_csv (csv_destination, index = None, header=True) + fields=data.keys() + count = 0 #potential painpoint + #writting of log handled by main process + if print_images: + logging.info("Start processing Images") + filedata=data + total = len(chunk) + stamp = time.time() + with Pool(core_count) as p: + res = p.imap_unordered(extract_images,range(len(filedata))) + for out in res: + (fmap,fail_path,err) = out + if err: + count +=1 + copyfile(fail_path[0],fail_path[1]) + err_msg = str(count) + ' out of ' + str(len(chunk)) + ' dicom images have failed extraction' + logging.error(err_msg) + else: + fm.write(fmap) + fm.close() + logging.info('Chunk run time: %s %s', time.time() - t_start, ' seconds!') + + +logging.info('Generating final metadata file') + +#identify the +col_names= set() +metas = glob.glob( "{}*.csv".format(meta_directory)) +#for each meta file identify the columns that are not na's for 90% of data +for meta in metas: + m = pd.read_csv(meta,dtype='str') + d_len = m.shape[0] + interest_names = [e for e in m.columns if ( (m[e]. 
isna()==True).sum() /d_len ) <.9 ] #count if percentage > .9 + col_names.update(interest_names) +#load every metadata file using only valid columns +meta_list = list() +for meta in metas: + m = pd.read_csv(meta,dtype='str',usecols=col_names) + meta_list.append(m) +merged_meta = pd.concat(meta_list,ignore_index=True) +merged_meta.to_csv('{}/metadata.csv'.format(output_directory),index=False) +#getting a single mapping file +logging.info('Generatign final mapping file') +mappings = glob.glob("{}/maps/*.csv".format(output_directory)) +map_list = list() +for mapping in mappings: + map_list.append(pd.read_csv(mapping,dtype='str')) +merged_maps = pd.concat(map_list,ignore_index=True) +if print_only_common_headers: + mask_common_fields = merged_maps.isnull().mean() < 0.1 + common_fields = set(np.asarray(merged_maps.columns)[mask_common_fields]) + merged_maps = merged_maps[common_fields] +merged_maps.to_csv('{}/mapping.csv'.format(output_directory),index=False) + + +if send_email: + subprocess.call('echo "Niffler has successfully completed the png conversion" | mail -s "The image conversion has been complete" {0}'.format(email), shell=True) +# Record the total run-time +logging.info('Total run time: %s %s', time.time() - t_start, ' seconds!') From 19fbe867ba9e0d5c8f51c38edad40a2569eb3d48 Mon Sep 17 00:00:00 2001 From: Ramon Luis Correa Medero Date: Tue, 6 Apr 2021 12:16:56 -0400 Subject: [PATCH 18/20] remove makedir calls --- modules/png-extraction/ImageExtractor.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/modules/png-extraction/ImageExtractor.py b/modules/png-extraction/ImageExtractor.py index d95a05d..3422f5f 100644 --- a/modules/png-extraction/ImageExtractor.py +++ b/modules/png-extraction/ImageExtractor.py @@ -153,8 +153,7 @@ def extract_images(i): ID=filedata.iloc[i].loc['PatientID'] # Unique identifier for the Patient. folderName = hashlib.sha224(ID.encode('utf-8')).hexdigest() #check for existence of patient folder. Create if it does not exist. - if not (os.path.exists(png_destination + folderName)): # it is completely possible for multiple proceses to run this check at same time. - os.mkdir(png_destination + folderName) + os.makedirs(png_destination + folderName,exist_ok=True) elif flattened_to_level == 'study': ID1=filedata.iloc[i].loc['PatientID'] # Unique identifier for the Patient. try: @@ -176,8 +175,7 @@ def extract_images(i): folderName = hashlib.sha224(ID1.encode('utf-8')).hexdigest() + "/" + \ hashlib.sha224(ID2.encode('utf-8')).hexdigest() + "/" + hashlib.sha224(ID3.encode('utf-8')).hexdigest() #check for existence of the folder tree patient/study/series. Create if it does not exist. - if not (os.path.exists(png_destination + folderName)): # it is completely possible for multiple proceses to run this check at same time. 
- os.makedirs(png_destination + folderName) + os.makedirs(png_destination + folderName,exist_ok=True) pngfile = png_destination+folderName+'/' + hashlib.sha224(imName.encode('utf-8')).hexdigest() + '.png' @@ -220,9 +218,6 @@ def extract_images(i): except BaseException as error: found_err = error logging.error(found_err) - print('---pokemon--') - print(error) - print(found_err) fail_path = filedata.iloc[i].loc['file'], failed + '3/' + os.path.split(filedata.iloc[i].loc['file'])[1][:-4]+'.dcm' except Exception as error: found_err = error From 7cc2832a1e46d26ab7d3466b188e28907041e2de Mon Sep 17 00:00:00 2001 From: Ramon Luis Correa Medero Date: Tue, 6 Apr 2021 12:17:28 -0400 Subject: [PATCH 19/20] remove accidental pussh to wrong dir --- modules/ImageExtractor.py | 396 -------------------------------------- 1 file changed, 396 deletions(-) delete mode 100644 modules/ImageExtractor.py diff --git a/modules/ImageExtractor.py b/modules/ImageExtractor.py deleted file mode 100644 index 3422f5f..0000000 --- a/modules/ImageExtractor.py +++ /dev/null @@ -1,396 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -import os -import glob -from shutil import copyfile -import hashlib -import json -import sys -import subprocess -import logging -from multiprocessing import Pool -import pdb -import time -import pickle -import numpy as np -import pandas as pd -import pydicom as dicom -import png -#pydicom imports needed to handle data errrors -from pydicom import config -from pydicom import datadict -from pydicom import values - -with open('config.json', 'r') as f: - niffler = json.load(f) - -#Get variables for StoreScp from config.json. -print_images = niffler['PrintImages'] -print_only_common_headers = niffler['CommonHeadersOnly'] -dicom_home = niffler['DICOMHome'] #the folder containing your dicom files -output_directory = niffler['OutputDirectory'] -depth = niffler['Depth'] -processes = niffler['UseProcesses'] #how many processes to use. 
-flattened_to_level = niffler['FlattenedToLevel'] -email = niffler['YourEmail'] -send_email = niffler['SendEmail'] -no_splits = niffler['SplitIntoChunks'] -is16Bit = niffler['is16Bit'] - -png_destination = output_directory + '/extracted-images/' -failed = output_directory +'/failed-dicom/' -maps_directory = output_directory + '/maps/' -meta_directory = output_directory + '/meta/' - -LOG_FILENAME = output_directory + '/ImageExtractor.out' -pickle_file = output_directory + '/ImageExtractor.pickle' -# record the start time -t_start = time.time() - -if not os.path.exists(output_directory): - os.makedirs(output_directory) - -logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) - -if not os.path.exists(maps_directory): - os.makedirs(maps_directory) - -if not os.path.exists(meta_directory): - os.makedirs(meta_directory) - -if not os.path.exists(png_destination): - os.makedirs(png_destination) - -if not os.path.exists(failed): - os.makedirs(failed) - -if not os.path.exists(failed + "/1"): - os.makedirs(failed + "/1") - -if not os.path.exists(failed + "/2"): - os.makedirs(failed + "/2") - -if not os.path.exists(failed + "/3"): - os.makedirs(failed + "/3") - -if not os.path.exists(failed + "/4"): - os.makedirs(failed + "/4") - -#%%Function for getting tuple for field,val pairs -def get_tuples(plan, outlist = None, key = ""): - if len(key)>0: - key = key + "_" - if not outlist: - outlist = [] - for aa in plan.dir(): - try: - hasattr(plan,aa) - except TypeError as e: - logging.warning('Type Error encountered') - if (hasattr(plan, aa) and aa!='PixelData'): - value = getattr(plan, aa) - start = len(outlist) - #if dicom sequence extract tags from each element - if type(value) is dicom.sequence.Sequence: - for nn, ss in enumerate(list(value)): - newkey = "_".join([key,("%d"%nn),aa]) if len(key) else "_".join([("%d"%nn),aa]) - candidate = get_tuples(ss,outlist=None,key=newkey) - #if extracted tuples are too big condense to a string - if len(candidate)>2000: - outlist.append((newkey,str(candidate))) - else: - outlist.extend(candidate) - else: - if type(value) is dicom.valuerep.DSfloat: - value = float(value) - elif type(value) is dicom.valuerep.IS: - value = str(value) - elif type(value) is dicom.valuerep.MultiValue: - value = tuple(value) - elif type(value) is dicom.uid.UID: - value = str(value) - outlist.append((key + aa, value)) #appends name, value pair for this file. these are later concatenated to the dataframe - return outlist - - -def extract_headers(f_list_elem): - nn,ff = f_list_elem # unpack enumerated list - plan = dicom.dcmread(ff, force=True) #reads in dicom file - #checks if this file has an image - c=True - try: - check=plan.pixel_array #throws error if dicom file has no image - except: - c = False - kv = get_tuples(plan) #gets tuple for field,val pairs for this file. 
function defined above - # dicom images should not have more than 300 - if len(kv)>500: - logging.debug(str(len(kv)) + " dicoms produced by " + ff) - kv.append(('file',chunk[nn])) #adds my custom field with the original filepath - kv.append(('has_pix_array',c)) #adds my custom field with if file has image - if c: - kv.append(('category','uncategorized')) #adds my custom category field - useful if classifying images before processing - else: - kv.append(('category','no image')) #adds my custom category field, makes note as imageless - return dict(kv) - -#%%Function to extract pixel array information -#takes an integer used to index into the global filedata dataframe -#returns tuple of -# filemapping: dicom to png paths (as str) -# fail_path: dicom to failed folder (as tuple) -# found_err: error code produced when processing -def extract_images(i): - ds = dicom.dcmread(filedata.iloc[i].loc['file'], force=True) #read file in - found_err=None - filemapping = "" - fail_path = "" - try: - im=ds.pixel_array #pull image from read dicom - imName=os.path.split(filedata.iloc[i].loc['file'])[1][:-4] #get file name ex: IM-0107-0022 - - if flattened_to_level == 'patient': - ID=filedata.iloc[i].loc['PatientID'] # Unique identifier for the Patient. - folderName = hashlib.sha224(ID.encode('utf-8')).hexdigest() - #check for existence of patient folder. Create if it does not exist. - os.makedirs(png_destination + folderName,exist_ok=True) - elif flattened_to_level == 'study': - ID1=filedata.iloc[i].loc['PatientID'] # Unique identifier for the Patient. - try: - ID2=filedata.iloc[i].loc['StudyInstanceUID'] # Unique identifier for the Study. - except: - ID2='ALL-STUDIES' - folderName = hashlib.sha224(ID1.encode('utf-8')).hexdigest() + "/" + \ - hashlib.sha224(ID2.encode('utf-8')).hexdigest() - #check for existence of the folder tree patient/study/series. Create if it does not exist. - os.makedirs(png_destination + folderName,exist_ok=True) - else: - ID1=filedata.iloc[i].loc['PatientID'] # Unique identifier for the Patient. - try: - ID2=filedata.iloc[i].loc['StudyInstanceUID'] # Unique identifier for the Study. - ID3=filedata.iloc[i].loc['SeriesInstanceUID'] # Unique identifier of the Series. - except: - ID2='ALL-STUDIES' - ID3='ALL-SERIES' - folderName = hashlib.sha224(ID1.encode('utf-8')).hexdigest() + "/" + \ - hashlib.sha224(ID2.encode('utf-8')).hexdigest() + "/" + hashlib.sha224(ID3.encode('utf-8')).hexdigest() - #check for existence of the folder tree patient/study/series. Create if it does not exist. - os.makedirs(png_destination + folderName,exist_ok=True) - - - pngfile = png_destination+folderName+'/' + hashlib.sha224(imName.encode('utf-8')).hexdigest() + '.png' - dicom_path = filedata.iloc[i].loc['file'] - image_path = png_destination+folderName+'/' + hashlib.sha224(imName.encode('utf-8')).hexdigest() + '.png' - if is16Bit: - # write the PNG file as a 16-bit greyscale - image_2d = ds.pixel_array.astype(np.double) - # # Rescaling grey scale between 0-255 - image_2d_scaled = (np.maximum(image_2d,0) / image_2d.max()) * 65535.0 - # # Convert to uint - shape = ds.pixel_array.shape - image_2d_scaled = np.uint16(image_2d_scaled) - with open(pngfile , 'wb') as png_file: - w = png.Writer(shape[1], shape[0], greyscale=True,bitdepth=16) - w.write(png_file, image_2d_scaled) - else: - shape = ds.pixel_array.shape - # # Convert to float to avoid overflow or underflow losses. 
- image_2d = ds.pixel_array.astype(float) - # - # # Rescaling grey scale between 0-255 - image_2d_scaled = (np.maximum(image_2d,0) / image_2d.max()) * 255.0 - # - # # Convert to uint - image_2d_scaled = np.uint8(image_2d_scaled) - # # Write the PNG file - with open(pngfile , 'wb') as png_file: - w = png.Writer(shape[1], shape[0], greyscale=True) - w.write(png_file, image_2d_scaled) - filemapping = filedata.iloc[i].loc['file'] + ', ' + pngfile + '\n' - except AttributeError as error: - found_err = error - logging.error(found_err) - fail_path = filedata.iloc[i].loc['file'], failed + '1/' + os.path.split(filedata.iloc[i].loc['file'])[1][:-4]+'.dcm' - except ValueError as error: - found_err = error - logging.error(found_err) - fail_path = filedata.iloc[i].loc['file'], failed + '2/' + os.path.split(filedata.iloc[i].loc['file'])[1][:-4]+'.dcm' - except BaseException as error: - found_err = error - logging.error(found_err) - fail_path = filedata.iloc[i].loc['file'], failed + '3/' + os.path.split(filedata.iloc[i].loc['file'])[1][:-4]+'.dcm' - except Exception as error: - found_err = error - logging.error(found_err) - fail_path = filedata.iloc[i].loc['file'], failed + '4/' + os.path.split(filedata.iloc[i].loc['file'])[1][:-4]+'.dcm' - return (filemapping,fail_path,found_err) - - -#%%Function when pydicom fails to read a value attempt to read as -#other types. -def fix_mismatch_callback(raw_elem, **kwargs): - try: - values.convert_value(raw_elem.VR, raw_elem) - except TypeError: - for vr in kwargs['with_VRs']: - try: - values.convert_value(vr, raw_elem) - except TypeError: - pass - else: - raw_elem = raw_elem._replace(VR=vr) - break - return raw_elem - - -def get_path(depth): - directory = dicom_home + '/' - i = 0 - while i < depth: - directory += "*/" - i += 1 - return directory + "*.dcm" - -#%%Function used by pydicom. -def fix_mismatch(with_VRs=['PN', 'DS', 'IS']): - """A callback function to check that RawDataElements are translatable - with their provided VRs. If not, re-attempt translation using - some other translators. - Parameters - ---------- - with_VRs : list, [['PN', 'DS', 'IS']] - A list of VR strings to attempt if the raw data element value cannot - be translated with the raw data element's VR. - Returns - ------- - No return value. The callback function will return either - the original RawDataElement instance, or one with a fixed VR. - """ - dicom.config.data_element_callback = fix_mismatch_callback - config.data_element_callback_kwargs = { - 'with_VRs': with_VRs, - } - -fix_mismatch() -if processes == 0.5: # use half the cores to avoid high ram usage - core_count = int(os.cpu_count()/2) -elif processes == 0: # use all the cores - core_count = int(os.cpu_count()) -elif processes < os.cpu_count(): # use the specified number of cores to avoid high ram usage - core_count = processes -else: - core_count = int(os.cpu_count()) -#%% get set up to create dataframe -dirs = os.listdir(dicom_home) -#gets all dicom files. if editing this code, get filelist into the format of a list of strings, -#with each string as the file path to a different dicom file. 
-file_path = get_path(depth) - -if os.path.isfile(pickle_file): - f=open(pickle_file,'rb') - filelist=pickle.load(f) -else: - filelist=glob.glob(file_path, recursive=True) #this searches the folders at the depth we request and finds all dicoms - pickle.dump(filelist,open(pickle_file,'wb')) -file_chunks = np.array_split(filelist,no_splits) -logging.info('Number of dicom files: ' + str(len(filelist))) -logging.info('Number of chunks is 100 with size ' + str(len(file_chunks[0])) ) - -try: - ff = filelist[0] #load first file as a template to look at all -except IndexError: - logging.error("There is no file present in the given folder in " + file_path) - sys.exit(1) - -plan = dicom.dcmread(ff, force=True) -logging.debug('Loaded the first file successfully') - -keys = [(aa) for aa in plan.dir() if (hasattr(plan, aa) and aa!='PixelData')] -#%%checks for images in fields and prints where they are -for field in plan.dir(): - if (hasattr(plan, field) and field!='PixelData'): - entry = getattr(plan, field) - if type(entry) is bytes: - logging.debug(field) - logging.debug(str(entry)) -for i,chunk in enumerate(file_chunks): - csv_destination = "{}/meta/metadata_{}.csv".format(output_directory,i) - mappings ="{}/maps/mapping_{}.csv".format(output_directory,i) - fm = open(mappings, "w+") - filemapping = 'Original DICOM file location, PNG location \n' - fm.write(filemapping) - # add a check to see if the metadata has already been extracted - #%%step through whole file list, read in file, append fields to future dataframe of all files - headerlist = [] - #start up a multi processing pool - #for every item in filelist send data to a subprocess and run extract_headers func - #output is then added to headerlist as they are completed (no ordering is done) - with Pool(core_count) as p: - res= p.imap_unordered(extract_headers,enumerate(chunk)) - for i,e in enumerate(res): - headerlist.append(e) - data = pd.DataFrame(headerlist) - logging.info('Chunk ' + str(i) + ' Number of fields per file : ' + str(len(data.columns))) - #%%find common fields - #make dataframe containing all fields and all files minus those removed in previous block - #%%export csv file of final dataframe - export_csv = data.to_csv (csv_destination, index = None, header=True) - fields=data.keys() - count = 0 #potential painpoint - #writting of log handled by main process - if print_images: - logging.info("Start processing Images") - filedata=data - total = len(chunk) - stamp = time.time() - with Pool(core_count) as p: - res = p.imap_unordered(extract_images,range(len(filedata))) - for out in res: - (fmap,fail_path,err) = out - if err: - count +=1 - copyfile(fail_path[0],fail_path[1]) - err_msg = str(count) + ' out of ' + str(len(chunk)) + ' dicom images have failed extraction' - logging.error(err_msg) - else: - fm.write(fmap) - fm.close() - logging.info('Chunk run time: %s %s', time.time() - t_start, ' seconds!') - - -logging.info('Generating final metadata file') - -#identify the -col_names= set() -metas = glob.glob( "{}*.csv".format(meta_directory)) -#for each meta file identify the columns that are not na's for 90% of data -for meta in metas: - m = pd.read_csv(meta,dtype='str') - d_len = m.shape[0] - interest_names = [e for e in m.columns if ( (m[e]. 
isna()==True).sum() /d_len ) <.9 ] #count if percentage > .9 - col_names.update(interest_names) -#load every metadata file using only valid columns -meta_list = list() -for meta in metas: - m = pd.read_csv(meta,dtype='str',usecols=col_names) - meta_list.append(m) -merged_meta = pd.concat(meta_list,ignore_index=True) -merged_meta.to_csv('{}/metadata.csv'.format(output_directory),index=False) -#getting a single mapping file -logging.info('Generatign final mapping file') -mappings = glob.glob("{}/maps/*.csv".format(output_directory)) -map_list = list() -for mapping in mappings: - map_list.append(pd.read_csv(mapping,dtype='str')) -merged_maps = pd.concat(map_list,ignore_index=True) -if print_only_common_headers: - mask_common_fields = merged_maps.isnull().mean() < 0.1 - common_fields = set(np.asarray(merged_maps.columns)[mask_common_fields]) - merged_maps = merged_maps[common_fields] -merged_maps.to_csv('{}/mapping.csv'.format(output_directory),index=False) - - -if send_email: - subprocess.call('echo "Niffler has successfully completed the png conversion" | mail -s "The image conversion has been complete" {0}'.format(email), shell=True) -# Record the total run-time -logging.info('Total run time: %s %s', time.time() - t_start, ' seconds!') From d4de24c4b7f5437f24cdf8d15effe98744dcd986 Mon Sep 17 00:00:00 2001 From: Ramon Luis Correa Medero Date: Tue, 6 Apr 2021 12:20:23 -0400 Subject: [PATCH 20/20] remove erroneous logging --- modules/png-extraction/ImageExtractor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/png-extraction/ImageExtractor.py b/modules/png-extraction/ImageExtractor.py index 3422f5f..909dbad 100644 --- a/modules/png-extraction/ImageExtractor.py +++ b/modules/png-extraction/ImageExtractor.py @@ -294,7 +294,6 @@ def fix_mismatch(with_VRs=['PN', 'DS', 'IS']): pickle.dump(filelist,open(pickle_file,'wb')) file_chunks = np.array_split(filelist,no_splits) logging.info('Number of dicom files: ' + str(len(filelist))) -logging.info('Number of chunks is 100 with size ' + str(len(file_chunks[0])) ) try: ff = filelist[0] #load first file as a template to look at all
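# Illustrative sketch (not part of this patch series): the metadata-merge step that
# closes ImageExtractor.py (seen in patches 17 and 19 above) keeps the columns that
# are non-null often enough in each per-chunk CSV and concatenates the chunks into
# one metadata.csv. The 0.9 threshold mirrors the original; the callable usecols is
# an assumption added here so a chunk that lacks a column does not raise.
import glob
import pandas as pd

def merge_metadata(meta_dir, out_csv, max_na_fraction=0.9):
    col_names = set()
    metas = glob.glob('{}/*.csv'.format(meta_dir))
    for meta in metas:
        m = pd.read_csv(meta, dtype='str')
        keep = [c for c in m.columns
                if m[c].isna().sum() / max(len(m), 1) < max_na_fraction]
        col_names.update(keep)
    frames = [pd.read_csv(meta, dtype='str', usecols=lambda c: c in col_names)
              for meta in metas]
    pd.concat(frames, ignore_index=True).to_csv(out_csv, index=False)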