From 87d894b681444d8120241a9a3e041040f1b6c476 Mon Sep 17 00:00:00 2001 From: Nishchal-007 Date: Tue, 30 Mar 2021 20:25:43 +0530 Subject: [PATCH 01/20] missing csv entries --- modules/cold-extraction/ColdDataRetriever.py | 60 ++++++++++---------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/modules/cold-extraction/ColdDataRetriever.py b/modules/cold-extraction/ColdDataRetriever.py index 24c863c..d358375 100644 --- a/modules/cold-extraction/ColdDataRetriever.py +++ b/modules/cold-extraction/ColdDataRetriever.py @@ -130,24 +130,40 @@ def initialize(): with open(csv_file, newline='') as f: reader = csv.reader(f) next(f) + + # Changed below part for finding missing csv entries and skipping them for row in reader: + row = [x.strip() for x in row] + #print(row) if (extraction_type == 'empi_date'): - patients.append(row[patient_index]) - temp_date = row[date_index] - dt_stamp = datetime.datetime.strptime(temp_date, date_format) - date_str = dt_stamp.strftime('%Y%m%d') - dates.append(date_str) - length = len(patients) + if set(row).pop()=='': + pass + else: + patients.append(row[patient_index]) + temp_date = row[date_index] + dt_stamp = datetime.datetime.strptime(temp_date, date_format) + date_str = dt_stamp.strftime('%Y%m%d') + dates.append(date_str) + length = len(patients) elif (extraction_type == 'empi'): - patients.append(row[patient_index]) - length = len(patients) + if set(row).pop()=='': + pass + else: + patients.append(row[patient_index]) + length = len(patients) elif (extraction_type == 'accession'): - accessions.append(row[accession_index]) - length = len(accessions) + if set(row).pop()=='': + pass + else: + accessions.append(row[accession_index]) + length = len(accessions) elif (extraction_type == 'empi_accession'): - patients.append(row[patient_index]) - accessions.append(row[accession_index]) - length = len(accessions) + if set(row).pop()=='': + pass + else: + patients.append(row[patient_index]) + accessions.append(row[accession_index]) + length = len(accessions) # Run the retrieval only once, when the extraction script starts, and keep it running in a separate thread. @@ -180,22 +196,6 @@ def retrieve(): subprocess.call("{0}/movescu -c {1} -b {2} -M PatientRoot -m PatientID={3} -m AccessionNumber={4} --dest {5}".format(DCM4CHE_BIN, SRC_AET, QUERY_AET, PatientID, Accession, DEST_AET), shell=True) extracted_ones.append(temp_id) - # For the cases that have the EMPI. - elif (extraction_type == 'empi'): - # Create our Identifier (query) dataset - for pid in range(0, len(patients)): - PatientID = patients[pid] - if NIGHTLY_ONLY: - if (datetime.datetime.now().hour >= END_HOUR and datetime.datetime.now().hour < START_HOUR): - # log once while sleeping - logging.info("Nightly mode. Niffler schedules the extraction to resume at start hour {0} and start within 30 minutes after that. It will then pause at the end hour {1}".format(START_HOUR, END_HOUR)) - while (datetime.datetime.now().hour >= END_HOUR and datetime.datetime.now().hour < START_HOUR): - # sleep for 5 minutes - time.sleep(300) - if ((not resume) or (resume and (PatientID not in extracted_ones))): - subprocess.call("{0}/movescu -c {1} -b {2} -M PatientRoot -m PatientID={3} --dest {4}".format(DCM4CHE_BIN, SRC_AET, QUERY_AET, PatientID, DEST_AET), shell=True) - extracted_ones.append(PatientID) - # For the cases that does not have the typical EMPI and Accession values together. 
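# Illustrative sketch (not part of this patch series): patch 01 above skips CSV
# rows whose required fields are blank. A self-contained version of that idea,
# assuming a header row and a "patient,date" column layout (the indices and the
# date format below are assumptions for the example, not the module's config):
import csv
import datetime

def load_empi_date_rows(csv_path, patient_index=0, date_index=1, date_format='%Y-%m-%d'):
    patients, dates = [], []
    with open(csv_path, newline='') as f:
        reader = csv.reader(f)
        next(reader)  # skip the header row
        for row in reader:
            row = [x.strip() for x in row]
            # skip the row when either required field is empty
            if row[patient_index] == '' or row[date_index] == '':
                continue
            patients.append(row[patient_index])
            dt = datetime.datetime.strptime(row[date_index], date_format)
            dates.append(dt.strftime('%Y%m%d'))
    return patients, dates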
elif (extraction_type == 'empi_date' or extraction_type == 'accession'): # Create our Identifier (query) dataset @@ -277,4 +277,4 @@ def run_threaded(job_func): time.sleep(1) except KeyboardInterrupt: check_kill_process() - sys.exit(0) \ No newline at end of file + sys.exit(0) From 45cf9afd004dbf9c08ff9589dc2f57aba3298c6f Mon Sep 17 00:00:00 2001 From: Nishchal Singi <71981858+Nishchal-007@users.noreply.github.com> Date: Tue, 30 Mar 2021 22:03:25 +0530 Subject: [PATCH 02/20] removed comment and added missing code --- modules/cold-extraction/ColdDataRetriever.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/modules/cold-extraction/ColdDataRetriever.py b/modules/cold-extraction/ColdDataRetriever.py index d358375..594a8eb 100644 --- a/modules/cold-extraction/ColdDataRetriever.py +++ b/modules/cold-extraction/ColdDataRetriever.py @@ -130,11 +130,10 @@ def initialize(): with open(csv_file, newline='') as f: reader = csv.reader(f) next(f) - + # Changed below part for finding missing csv entries and skipping them for row in reader: row = [x.strip() for x in row] - #print(row) if (extraction_type == 'empi_date'): if set(row).pop()=='': pass @@ -196,6 +195,22 @@ def retrieve(): subprocess.call("{0}/movescu -c {1} -b {2} -M PatientRoot -m PatientID={3} -m AccessionNumber={4} --dest {5}".format(DCM4CHE_BIN, SRC_AET, QUERY_AET, PatientID, Accession, DEST_AET), shell=True) extracted_ones.append(temp_id) + # For the cases that have the EMPI. + elif (extraction_type == 'empi'): + # Create our Identifier (query) dataset + for pid in range(0, len(patients)): + PatientID = patients[pid] + if NIGHTLY_ONLY: + if (datetime.datetime.now().hour >= END_HOUR and datetime.datetime.now().hour < START_HOUR): + # log once while sleeping + logging.info("Nightly mode. Niffler schedules the extraction to resume at start hour {0} and start within 30 minutes after that. It will then pause at the end hour {1}".format(START_HOUR, END_HOUR)) + while (datetime.datetime.now().hour >= END_HOUR and datetime.datetime.now().hour < START_HOUR): + # sleep for 5 minutes + time.sleep(300) + if ((not resume) or (resume and (PatientID not in extracted_ones))): + subprocess.call("{0}/movescu -c {1} -b {2} -M PatientRoot -m PatientID={3} --dest {4}".format(DCM4CHE_BIN, SRC_AET, QUERY_AET, PatientID, DEST_AET), shell=True) + extracted_ones.append(PatientID) + # For the cases that does not have the typical EMPI and Accession values together. 
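# Illustrative sketch (not part of this patch): the nightly-mode guard restored in
# patch 02 above pauses extraction while the current hour falls between END_HOUR
# (morning) and START_HOUR (evening). The hour values here are assumptions.
import datetime
import time

START_HOUR = 19   # assumed evening start of the extraction window
END_HOUR = 7      # assumed morning end of the extraction window

def in_daytime_pause():
    hour = datetime.datetime.now().hour
    return END_HOUR <= hour < START_HOUR

def wait_for_nightly_window():
    # sleep in 5-minute steps until the nightly window opens again
    while in_daytime_pause():
        time.sleep(300)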
elif (extraction_type == 'empi_date' or extraction_type == 'accession'): # Create our Identifier (query) dataset From 3ef6f100581c0fbf43a6152aa238427b71f5b3f7 Mon Sep 17 00:00:00 2001 From: Nishchal-007 Date: Thu, 1 Apr 2021 18:28:25 +0530 Subject: [PATCH 03/20] Updated ColdDataRetriever.py --- modules/cold-extraction/ColdDataRetriever.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/cold-extraction/ColdDataRetriever.py b/modules/cold-extraction/ColdDataRetriever.py index 594a8eb..10c2176 100644 --- a/modules/cold-extraction/ColdDataRetriever.py +++ b/modules/cold-extraction/ColdDataRetriever.py @@ -135,7 +135,7 @@ def initialize(): for row in reader: row = [x.strip() for x in row] if (extraction_type == 'empi_date'): - if set(row).pop()=='': + if ((row[patient_index] == "") or (row[date_index] == "")): pass else: patients.append(row[patient_index]) @@ -145,19 +145,19 @@ def initialize(): dates.append(date_str) length = len(patients) elif (extraction_type == 'empi'): - if set(row).pop()=='': + if ((row[patient_index] == "")): pass else: patients.append(row[patient_index]) length = len(patients) elif (extraction_type == 'accession'): - if set(row).pop()=='': + if ((row[accession_index] == "")): pass else: accessions.append(row[accession_index]) length = len(accessions) elif (extraction_type == 'empi_accession'): - if set(row).pop()=='': + if ((row[patient_index] == "") or (row[accession_index] == "")): pass else: patients.append(row[patient_index]) From 6ce407f0eeec05db285c244eb89e248007c85a9e Mon Sep 17 00:00:00 2001 From: jeong-jasonji <55253180+jeong-jasonji@users.noreply.github.com> Date: Thu, 1 Apr 2021 10:16:46 -0400 Subject: [PATCH 04/20] little update - filename matches the anonymized SOPInstanceUID - requires pydicom 1.4.2 or above... 
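# Illustrative sketch (not part of this patch): patch 03 above replaces
# set(row).pop() == '' with explicit per-column checks. A set is unordered, so
# .pop() returns an arbitrary cell and only notices a blank field by chance:
row = ['MRN123', '', 'unused-extra-cell']

print(set(row).pop() == '')        # unreliable: may print True or False
patient_index, date_index = 0, 1   # assumed column layout for the example
print(row[patient_index] == '' or row[date_index] == '')   # True: the date cell is blank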
--- modules/png-extraction/anon_pydicom.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/modules/png-extraction/anon_pydicom.py b/modules/png-extraction/anon_pydicom.py index b1ae396..3595d43 100644 --- a/modules/png-extraction/anon_pydicom.py +++ b/modules/png-extraction/anon_pydicom.py @@ -10,7 +10,6 @@ import sys import pydicom import random -import shutil import pickle @@ -102,20 +101,17 @@ def dcm_anonymize(dcm_folders, output_path, stop=None): test_file = pydicom.dcmread(test_file_path) anon_id = anonSample(test_file, 'StudyInstanceUID', UIDs['StudyInstanceUID']) # make folder with the anonymized studyUID name - print(anon_id) study_folder = os.path.join(output_path, anon_id) os.mkdir(study_folder) for file in files: - # copy the file to the new anon folder - shutil.copyfile(os.path.join(dcm_folder, file), os.path.join(study_folder, file)) - dcm_file = pydicom.dcmread(os.path.join(study_folder, file)) + dcm_file = pydicom.dcmread(os.path.join(dcm_folder, file)) dcm_file.remove_private_tags() for UID in UIDs.keys(): # get the UID and get the anonymized UID anon_id = anonSample(dcm_file, UID, UIDs[UID]) # save instance UID to rename the filename (so that filename and SOPinstance matches) if UID == 'SOPInstanceUID': - new_filename = anon_id.copy() + new_filename = anon_id dcm_file[UID].value = anon_id # for the other tags, make them anonymous for tag in anon_tags: @@ -128,9 +124,9 @@ def dcm_anonymize(dcm_folders, output_path, stop=None): dcm_file.data_element(tag).value = 0 else: dcm_file.data_element(tag).value = 0.0 - dcm_file.save_as(os.path.join(study_folder, new_filename)) + dcm_file.save_as(os.path.join(study_folder, new_filename + '.dcm')) n += 1 - print('total files anonymized: {}/{}. Study: {}'.format(n, len(dcm_folders), study_folder), flush=True) + print('total folders anonymized: {}/{}. 
Study: {}'.format(n, len(dcm_folders), study_folder), flush=True) except: print('Invalid Dicom Error, skipping') skip_file = pydicom.dcmread(test_file_path, force=True) @@ -147,9 +143,10 @@ def dcm_anonymize(dcm_folders, output_path, stop=None): if __name__ == "__main__": # ex: 'python anon_pydicom.py /labs/banerjeelab/researchpacs_data/ /labs/banerjeelab/HCC_anon_dcm/200_noForce/' + # 'python anon_pydicom.py r'C:\Users\Jason\Desktop\Code_files\HITI\Niffler' r'C:\Users\Jason\Desktop\Code_files\HITI\Niffler\test_out'' data_dir = sys.argv[1] output_dir = sys.argv[2] - if len(sys.argv) > 2: + if len(sys.argv) > 3: # stopping number stop = int(sys.argv[3]) else: @@ -157,4 +154,8 @@ def dcm_anonymize(dcm_folders, output_path, stop=None): print('Extracting DICOM folders', flush=True) dcm_folders = get_dcm_folders(data_dir) print('Starting DICOM Study Anonymization', flush=True) - dcm_anonymize(dcm_folders, output_dir, stop=stop) + dcm_anonymize(dcm_folders, output_dir, stop=None) + + +data_dir = r'C:\Users\Jason\Desktop\Code_files\HITI\Niffler\test_files' +output_dir = r'C:\Users\Jason\Desktop\Code_files\HITI\Niffler\test_out' \ No newline at end of file From 92e0d594e9fe7bdb27b73bc742b93314d2897554 Mon Sep 17 00:00:00 2001 From: jeong-jasonji <55253180+jeong-jasonji@users.noreply.github.com> Date: Thu, 1 Apr 2021 13:59:23 -0400 Subject: [PATCH 05/20] removed hardcode --- modules/png-extraction/anon_pydicom.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/modules/png-extraction/anon_pydicom.py b/modules/png-extraction/anon_pydicom.py index 3595d43..272c2cd 100644 --- a/modules/png-extraction/anon_pydicom.py +++ b/modules/png-extraction/anon_pydicom.py @@ -155,7 +155,3 @@ def dcm_anonymize(dcm_folders, output_path, stop=None): dcm_folders = get_dcm_folders(data_dir) print('Starting DICOM Study Anonymization', flush=True) dcm_anonymize(dcm_folders, output_dir, stop=None) - - -data_dir = r'C:\Users\Jason\Desktop\Code_files\HITI\Niffler\test_files' -output_dir = r'C:\Users\Jason\Desktop\Code_files\HITI\Niffler\test_out' \ No newline at end of file From 4e1a602505684ce14c85f4f693e9b1e46399a02c Mon Sep 17 00:00:00 2001 From: Nishchal Singi <71981858+Nishchal-007@users.noreply.github.com> Date: Sat, 3 Apr 2021 19:33:51 +0530 Subject: [PATCH 06/20] Made required changes --- modules/cold-extraction/ColdDataRetriever.py | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/modules/cold-extraction/ColdDataRetriever.py b/modules/cold-extraction/ColdDataRetriever.py index 10c2176..9f1e2d3 100644 --- a/modules/cold-extraction/ColdDataRetriever.py +++ b/modules/cold-extraction/ColdDataRetriever.py @@ -130,14 +130,10 @@ def initialize(): with open(csv_file, newline='') as f: reader = csv.reader(f) next(f) - - # Changed below part for finding missing csv entries and skipping them for row in reader: row = [x.strip() for x in row] if (extraction_type == 'empi_date'): - if ((row[patient_index] == "") or (row[date_index] == "")): - pass - else: + if not ((row[patient_index] == "") or (row[date_index] == "")): patients.append(row[patient_index]) temp_date = row[date_index] dt_stamp = datetime.datetime.strptime(temp_date, date_format) @@ -145,21 +141,15 @@ def initialize(): dates.append(date_str) length = len(patients) elif (extraction_type == 'empi'): - if ((row[patient_index] == "")): - pass - else: + if not ((row[patient_index] == "")): patients.append(row[patient_index]) length = len(patients) elif (extraction_type == 'accession'): - if ((row[accession_index] == "")): - pass 
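# Illustrative sketch (not the module's actual code): patch 04 above saves each
# anonymized instance under its new SOPInstanceUID so the file name and the header
# stay in sync (the commit notes pydicom 1.4.2 or later is required). Using
# generate_uid() as the anonymized identifier is an assumption standing in for the
# module's anonSample() helper.
import os
import pydicom
from pydicom.uid import generate_uid

def anonymize_and_rename(dcm_path, out_dir):
    ds = pydicom.dcmread(dcm_path)
    ds.remove_private_tags()
    new_uid = generate_uid()          # assumption: fresh UID instead of anonSample()
    ds.SOPInstanceUID = new_uid
    out_path = os.path.join(out_dir, new_uid + '.dcm')
    ds.save_as(out_path)              # file name now matches the anonymized SOPInstanceUID
    return out_path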
- else: + if not ((row[accession_index] == "")): accessions.append(row[accession_index]) length = len(accessions) elif (extraction_type == 'empi_accession'): - if ((row[patient_index] == "") or (row[accession_index] == "")): - pass - else: + if not ((row[patient_index] == "") or (row[accession_index] == "")): patients.append(row[patient_index]) accessions.append(row[accession_index]) length = len(accessions) From eab58a3f8bcd0f8826d9416bb1ca65efe97c2e78 Mon Sep 17 00:00:00 2001 From: Pradeeban Kathiravelu Date: Sat, 3 Apr 2021 21:52:07 -0400 Subject: [PATCH 07/20] Move anon_pydicom to a new module --- modules/{png-extraction => dicom-anonymization}/anon_pydicom.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename modules/{png-extraction => dicom-anonymization}/anon_pydicom.py (100%) diff --git a/modules/png-extraction/anon_pydicom.py b/modules/dicom-anonymization/anon_pydicom.py similarity index 100% rename from modules/png-extraction/anon_pydicom.py rename to modules/dicom-anonymization/anon_pydicom.py From 5f75125126efe9620fc4697bad2dd8b639e93694 Mon Sep 17 00:00:00 2001 From: Pradeeban Kathiravelu Date: Sat, 3 Apr 2021 22:46:38 -0400 Subject: [PATCH 08/20] Anonymize Dicom files --- .../dicom-anonymization/{anon_pydicom.py => DicomAnonymizer.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename modules/dicom-anonymization/{anon_pydicom.py => DicomAnonymizer.py} (100%) diff --git a/modules/dicom-anonymization/anon_pydicom.py b/modules/dicom-anonymization/DicomAnonymizer.py similarity index 100% rename from modules/dicom-anonymization/anon_pydicom.py rename to modules/dicom-anonymization/DicomAnonymizer.py From ef910b0765d872d47c83977c6227d7cb7459e640 Mon Sep 17 00:00:00 2001 From: Pradeeban Kathiravelu Date: Sat, 3 Apr 2021 22:47:52 -0400 Subject: [PATCH 09/20] Update DicomAnonymizer.py Remove comment --- modules/dicom-anonymization/DicomAnonymizer.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/modules/dicom-anonymization/DicomAnonymizer.py b/modules/dicom-anonymization/DicomAnonymizer.py index 272c2cd..b4fdaae 100644 --- a/modules/dicom-anonymization/DicomAnonymizer.py +++ b/modules/dicom-anonymization/DicomAnonymizer.py @@ -142,8 +142,6 @@ def dcm_anonymize(dcm_folders, output_path, stop=None): if __name__ == "__main__": - # ex: 'python anon_pydicom.py /labs/banerjeelab/researchpacs_data/ /labs/banerjeelab/HCC_anon_dcm/200_noForce/' - # 'python anon_pydicom.py r'C:\Users\Jason\Desktop\Code_files\HITI\Niffler' r'C:\Users\Jason\Desktop\Code_files\HITI\Niffler\test_out'' data_dir = sys.argv[1] output_dir = sys.argv[2] if len(sys.argv) > 3: From cfc02822ca292ba75a869f3d1e658b4d4bd191c2 Mon Sep 17 00:00:00 2001 From: Pradeeban Kathiravelu Date: Sat, 3 Apr 2021 22:49:57 -0400 Subject: [PATCH 10/20] Create README.md --- modules/dicom-anonymization/README.md | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 modules/dicom-anonymization/README.md diff --git a/modules/dicom-anonymization/README.md b/modules/dicom-anonymization/README.md new file mode 100644 index 0000000..e990d49 --- /dev/null +++ b/modules/dicom-anonymization/README.md @@ -0,0 +1,6 @@ +# The Niffler DICOM Anonymizer + +You may convert a DICOM file into an anonymized DICOM file by running +``` +python DicomAnonymizer.py +``` From c0039b03c1654e23587a8937c08b2c3d1204d40c Mon Sep 17 00:00:00 2001 From: Pradeeban Kathiravelu Date: Sat, 3 Apr 2021 22:51:27 -0400 Subject: [PATCH 11/20] Update README.md Fix module reference --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/README.md b/README.md index 57e673f..69b01f1 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ Niffler enables receiving DICOM images real-time as a data stream from PACS as w # Configure Niffler -Niffler consists of 4 modules, inside the modules folder. Here we will look into the common configuration and installation steps of Niffler. An introduction to Niffler can be found [here](https://emory-hiti.github.io/Niffler/). +Niffler consists of 5 modules, inside the modules folder. Here we will look into the common configuration and installation steps of Niffler. An introduction to Niffler can be found [here](https://emory-hiti.github.io/Niffler/). ## Configure PACS From 548ba59b5696b05012ef944384c7d1a6494bee25 Mon Sep 17 00:00:00 2001 From: Pradeeban Kathiravelu Date: Sat, 3 Apr 2021 22:52:54 -0400 Subject: [PATCH 12/20] Update index.md --- docs/index.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/index.md b/docs/index.md index 8c28baa..e7f8165 100644 --- a/docs/index.md +++ b/docs/index.md @@ -21,6 +21,10 @@ Receives DICOM images as a stream from a PACS and extracts and stores the metada Converts a set of DICOM images into png images, extract metadata in a privacy-preserving manner. The extracted metadata is stored in a CSV file, along with the de-identified PNG images. The mapping of PNG files and their respective metadata is stored in a separate CSV file. +## dicom-anonymization + +Converts a set of DICOM images into anonymized DICOM images, stripping off the PHI. + ## app-layer The app-layer (application layer) consists of specific algorithms. The app-layer/src/main/scripts consists of Javascript scripts such as scanner clock calibration. The app-layer/src/main/java consists of the the scanner utilization computation algorithms developed in Java. From 03168977ce7d9173fb35eb179a71714d44cd0747 Mon Sep 17 00:00:00 2001 From: Pradeeban Kathiravelu Date: Mon, 5 Apr 2021 16:44:54 -0400 Subject: [PATCH 13/20] Update ImageExtractor.py Remove misleading log. 
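# Illustrative sketch (not part of this patch): the log line removed in patch 13
# below hard-codes "100" even though the chunk count comes from the SplitIntoChunks
# setting. A version that reports the configured value would read:
import logging
import numpy as np

def split_into_chunks(filelist, no_splits):
    file_chunks = np.array_split(filelist, no_splits)
    logging.info('Number of dicom files: %d', len(filelist))
    # report the configured chunk count instead of a hard-coded 100
    logging.info('Number of chunks is %d with size %d', no_splits, len(file_chunks[0]))
    return file_chunks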
--- modules/png-extraction/ImageExtractor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/png-extraction/ImageExtractor.py b/modules/png-extraction/ImageExtractor.py index 5e22212..db57c8a 100644 --- a/modules/png-extraction/ImageExtractor.py +++ b/modules/png-extraction/ImageExtractor.py @@ -298,7 +298,6 @@ def fix_mismatch(with_VRs=['PN', 'DS', 'IS']): pickle.dump(filelist,open(pickle_file,'wb')) file_chunks = np.array_split(filelist,no_splits) logging.info('Number of dicom files: ' + str(len(filelist))) -logging.info('Number of chunks is 100 with size ' + str(len(file_chunks[0])) ) try: ff = filelist[0] #load first file as a template to look at all From a3bd039c4f35aa060b602256bab572ba1eb77906 Mon Sep 17 00:00:00 2001 From: Ramon Luis Correa Medero Date: Mon, 5 Apr 2021 17:05:39 -0400 Subject: [PATCH 14/20] updated cpu count call and changed how pool is managed --- modules/png-extraction/ImageExtractor.py | 27 ++++++++++++------------ 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/modules/png-extraction/ImageExtractor.py b/modules/png-extraction/ImageExtractor.py index db57c8a..8d33b1e 100644 --- a/modules/png-extraction/ImageExtractor.py +++ b/modules/png-extraction/ImageExtractor.py @@ -1,26 +1,24 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -import numpy as np -import pandas as pd -import pydicom as dicom -import png, os, glob -import PIL as pil -from pprint import pprint -import hashlib +import os +import glob from shutil import copyfile -import logging -from multiprocessing import Pool +import hashlib import json import sys import subprocess +import logging +from multiprocessing import Pool import pdb +import time import pickle +import numpy as np +import pandas as pd +import pydicom as dicom #pydicom imports needed to handle data errrors from pydicom import config from pydicom import datadict from pydicom import values -from subprocess import Popen -import time with open('config.json', 'r') as f: niffler = json.load(f) @@ -243,7 +241,7 @@ def fix_mismatch_callback(raw_elem, **kwargs): pass else: raw_elem = raw_elem._replace(VR=vr) - break # I want to exit immediately after change is applied + break return raw_elem @@ -298,6 +296,7 @@ def fix_mismatch(with_VRs=['PN', 'DS', 'IS']): pickle.dump(filelist,open(pickle_file,'wb')) file_chunks = np.array_split(filelist,no_splits) logging.info('Number of dicom files: ' + str(len(filelist))) +logging.info('Number of chunks is 100 with size ' + str(len(file_chunks[0])) ) try: ff = filelist[0] #load first file as a template to look at all @@ -346,7 +345,7 @@ def fix_mismatch(with_VRs=['PN', 'DS', 'IS']): filedata=data total = len(chunk) stamp = time.time() - p = Pool(os.cpu_count()) + p = Pool(core_count) res = p.imap_unordered(extract_images,range(len(filedata))) for out in res: (fmap,fail_path,err) = out @@ -357,6 +356,8 @@ def fix_mismatch(with_VRs=['PN', 'DS', 'IS']): logging.error(err_msg) else: fm.write(fmap) + p.join() + p.close() fm.close() logging.info('Chunk run time: %s %s', time.time() - t_start, ' seconds!') From bf828388f911be950aa0d657ebcf58781c97430b Mon Sep 17 00:00:00 2001 From: Ramon Luis Correa Medero Date: Mon, 5 Apr 2021 17:09:17 -0400 Subject: [PATCH 15/20] removed hardcoded log line --- modules/png-extraction/ImageExtractor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/png-extraction/ImageExtractor.py b/modules/png-extraction/ImageExtractor.py index 8d33b1e..41f1d11 100644 --- a/modules/png-extraction/ImageExtractor.py +++ b/modules/png-extraction/ImageExtractor.py @@ 
-296,7 +296,6 @@ def fix_mismatch(with_VRs=['PN', 'DS', 'IS']): pickle.dump(filelist,open(pickle_file,'wb')) file_chunks = np.array_split(filelist,no_splits) logging.info('Number of dicom files: ' + str(len(filelist))) -logging.info('Number of chunks is 100 with size ' + str(len(file_chunks[0])) ) try: ff = filelist[0] #load first file as a template to look at all From 2d097329078316a2d283beb384eeaf671ae692c1 Mon Sep 17 00:00:00 2001 From: Ramon Luis Correa Medero Date: Tue, 6 Apr 2021 00:44:40 -0400 Subject: [PATCH 16/20] updates to error reporting, pool processing, Pooling update: 1. Changed pooling to use with satement instead of explicit calls to close and joins. The context manager handles this better than i would. 2. With statement results in better error reporting for edge cases. Strange print statement is preserved as i run more test. Uploading here for record purposes --- modules/png-extraction/ImageExtractor.py | 32 +++++++++++++----------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/modules/png-extraction/ImageExtractor.py b/modules/png-extraction/ImageExtractor.py index 41f1d11..d95a05d 100644 --- a/modules/png-extraction/ImageExtractor.py +++ b/modules/png-extraction/ImageExtractor.py @@ -15,6 +15,7 @@ import numpy as np import pandas as pd import pydicom as dicom +import png #pydicom imports needed to handle data errrors from pydicom import config from pydicom import datadict @@ -163,8 +164,7 @@ def extract_images(i): folderName = hashlib.sha224(ID1.encode('utf-8')).hexdigest() + "/" + \ hashlib.sha224(ID2.encode('utf-8')).hexdigest() #check for existence of the folder tree patient/study/series. Create if it does not exist. - if not (os.path.exists(png_destination + folderName)): # it is completely possible for multiple proceses to run this check at same time. - os.makedirs(png_destination + folderName) + os.makedirs(png_destination + folderName,exist_ok=True) else: ID1=filedata.iloc[i].loc['PatientID'] # Unique identifier for the Patient. 
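# Illustrative sketch (not part of this patch): the directory-creation change in
# patch 16 above swaps the "check then mkdir" pair for os.makedirs(..., exist_ok=True),
# closing the race where two worker processes both pass the existence check and one
# of them then fails inside makedirs.
import os

def ensure_dir(path):
    # safe to call concurrently from several worker processes
    os.makedirs(path, exist_ok=True)
    return path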
try: @@ -220,6 +220,9 @@ def extract_images(i): except BaseException as error: found_err = error logging.error(found_err) + print('---pokemon--') + print(error) + print(found_err) fail_path = filedata.iloc[i].loc['file'], failed + '3/' + os.path.split(filedata.iloc[i].loc['file'])[1][:-4]+'.dcm' except Exception as error: found_err = error @@ -296,6 +299,7 @@ def fix_mismatch(with_VRs=['PN', 'DS', 'IS']): pickle.dump(filelist,open(pickle_file,'wb')) file_chunks = np.array_split(filelist,no_splits) logging.info('Number of dicom files: ' + str(len(filelist))) +logging.info('Number of chunks is 100 with size ' + str(len(file_chunks[0])) ) try: ff = filelist[0] #load first file as a template to look at all @@ -344,19 +348,17 @@ def fix_mismatch(with_VRs=['PN', 'DS', 'IS']): filedata=data total = len(chunk) stamp = time.time() - p = Pool(core_count) - res = p.imap_unordered(extract_images,range(len(filedata))) - for out in res: - (fmap,fail_path,err) = out - if err: - count +=1 - copyfile(fail_path[0],fail_path[1]) - err_msg = str(count) + ' out of ' + str(len(chunk)) + ' dicom images have failed extraction' - logging.error(err_msg) - else: - fm.write(fmap) - p.join() - p.close() + with Pool(core_count) as p: + res = p.imap_unordered(extract_images,range(len(filedata))) + for out in res: + (fmap,fail_path,err) = out + if err: + count +=1 + copyfile(fail_path[0],fail_path[1]) + err_msg = str(count) + ' out of ' + str(len(chunk)) + ' dicom images have failed extraction' + logging.error(err_msg) + else: + fm.write(fmap) fm.close() logging.info('Chunk run time: %s %s', time.time() - t_start, ' seconds!') From b30fda265f08123868b573c4fc6f19de4de131f6 Mon Sep 17 00:00:00 2001 From: Ramon Luis Correa Medero Date: Tue, 6 Apr 2021 12:15:02 -0400 Subject: [PATCH 17/20] remove all makedir calls --- modules/ImageExtractor.py | 396 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 396 insertions(+) create mode 100644 modules/ImageExtractor.py diff --git a/modules/ImageExtractor.py b/modules/ImageExtractor.py new file mode 100644 index 0000000..3422f5f --- /dev/null +++ b/modules/ImageExtractor.py @@ -0,0 +1,396 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +import os +import glob +from shutil import copyfile +import hashlib +import json +import sys +import subprocess +import logging +from multiprocessing import Pool +import pdb +import time +import pickle +import numpy as np +import pandas as pd +import pydicom as dicom +import png +#pydicom imports needed to handle data errrors +from pydicom import config +from pydicom import datadict +from pydicom import values + +with open('config.json', 'r') as f: + niffler = json.load(f) + +#Get variables for StoreScp from config.json. +print_images = niffler['PrintImages'] +print_only_common_headers = niffler['CommonHeadersOnly'] +dicom_home = niffler['DICOMHome'] #the folder containing your dicom files +output_directory = niffler['OutputDirectory'] +depth = niffler['Depth'] +processes = niffler['UseProcesses'] #how many processes to use. 
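# Illustrative sketch (not part of this patch): patch 16 above moves the worker pool
# into a with-statement, so the pool is cleaned up on leaving the block even when a
# worker raises. It also replaces the earlier explicit p.join()/p.close() calls,
# which were in the wrong order (close must precede join).
from multiprocessing import Pool

def square(x):
    return x * x

def run_parallel(items, workers=4):
    results = []
    with Pool(workers) as p:                     # pool is terminated on leaving the block
        for out in p.imap_unordered(square, items):
            results.append(out)
    return results

if __name__ == '__main__':
    print(run_parallel(range(10)))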
+flattened_to_level = niffler['FlattenedToLevel'] +email = niffler['YourEmail'] +send_email = niffler['SendEmail'] +no_splits = niffler['SplitIntoChunks'] +is16Bit = niffler['is16Bit'] + +png_destination = output_directory + '/extracted-images/' +failed = output_directory +'/failed-dicom/' +maps_directory = output_directory + '/maps/' +meta_directory = output_directory + '/meta/' + +LOG_FILENAME = output_directory + '/ImageExtractor.out' +pickle_file = output_directory + '/ImageExtractor.pickle' +# record the start time +t_start = time.time() + +if not os.path.exists(output_directory): + os.makedirs(output_directory) + +logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) + +if not os.path.exists(maps_directory): + os.makedirs(maps_directory) + +if not os.path.exists(meta_directory): + os.makedirs(meta_directory) + +if not os.path.exists(png_destination): + os.makedirs(png_destination) + +if not os.path.exists(failed): + os.makedirs(failed) + +if not os.path.exists(failed + "/1"): + os.makedirs(failed + "/1") + +if not os.path.exists(failed + "/2"): + os.makedirs(failed + "/2") + +if not os.path.exists(failed + "/3"): + os.makedirs(failed + "/3") + +if not os.path.exists(failed + "/4"): + os.makedirs(failed + "/4") + +#%%Function for getting tuple for field,val pairs +def get_tuples(plan, outlist = None, key = ""): + if len(key)>0: + key = key + "_" + if not outlist: + outlist = [] + for aa in plan.dir(): + try: + hasattr(plan,aa) + except TypeError as e: + logging.warning('Type Error encountered') + if (hasattr(plan, aa) and aa!='PixelData'): + value = getattr(plan, aa) + start = len(outlist) + #if dicom sequence extract tags from each element + if type(value) is dicom.sequence.Sequence: + for nn, ss in enumerate(list(value)): + newkey = "_".join([key,("%d"%nn),aa]) if len(key) else "_".join([("%d"%nn),aa]) + candidate = get_tuples(ss,outlist=None,key=newkey) + #if extracted tuples are too big condense to a string + if len(candidate)>2000: + outlist.append((newkey,str(candidate))) + else: + outlist.extend(candidate) + else: + if type(value) is dicom.valuerep.DSfloat: + value = float(value) + elif type(value) is dicom.valuerep.IS: + value = str(value) + elif type(value) is dicom.valuerep.MultiValue: + value = tuple(value) + elif type(value) is dicom.uid.UID: + value = str(value) + outlist.append((key + aa, value)) #appends name, value pair for this file. these are later concatenated to the dataframe + return outlist + + +def extract_headers(f_list_elem): + nn,ff = f_list_elem # unpack enumerated list + plan = dicom.dcmread(ff, force=True) #reads in dicom file + #checks if this file has an image + c=True + try: + check=plan.pixel_array #throws error if dicom file has no image + except: + c = False + kv = get_tuples(plan) #gets tuple for field,val pairs for this file. 
function defined above + # dicom images should not have more than 300 + if len(kv)>500: + logging.debug(str(len(kv)) + " dicoms produced by " + ff) + kv.append(('file',chunk[nn])) #adds my custom field with the original filepath + kv.append(('has_pix_array',c)) #adds my custom field with if file has image + if c: + kv.append(('category','uncategorized')) #adds my custom category field - useful if classifying images before processing + else: + kv.append(('category','no image')) #adds my custom category field, makes note as imageless + return dict(kv) + +#%%Function to extract pixel array information +#takes an integer used to index into the global filedata dataframe +#returns tuple of +# filemapping: dicom to png paths (as str) +# fail_path: dicom to failed folder (as tuple) +# found_err: error code produced when processing +def extract_images(i): + ds = dicom.dcmread(filedata.iloc[i].loc['file'], force=True) #read file in + found_err=None + filemapping = "" + fail_path = "" + try: + im=ds.pixel_array #pull image from read dicom + imName=os.path.split(filedata.iloc[i].loc['file'])[1][:-4] #get file name ex: IM-0107-0022 + + if flattened_to_level == 'patient': + ID=filedata.iloc[i].loc['PatientID'] # Unique identifier for the Patient. + folderName = hashlib.sha224(ID.encode('utf-8')).hexdigest() + #check for existence of patient folder. Create if it does not exist. + os.makedirs(png_destination + folderName,exist_ok=True) + elif flattened_to_level == 'study': + ID1=filedata.iloc[i].loc['PatientID'] # Unique identifier for the Patient. + try: + ID2=filedata.iloc[i].loc['StudyInstanceUID'] # Unique identifier for the Study. + except: + ID2='ALL-STUDIES' + folderName = hashlib.sha224(ID1.encode('utf-8')).hexdigest() + "/" + \ + hashlib.sha224(ID2.encode('utf-8')).hexdigest() + #check for existence of the folder tree patient/study/series. Create if it does not exist. + os.makedirs(png_destination + folderName,exist_ok=True) + else: + ID1=filedata.iloc[i].loc['PatientID'] # Unique identifier for the Patient. + try: + ID2=filedata.iloc[i].loc['StudyInstanceUID'] # Unique identifier for the Study. + ID3=filedata.iloc[i].loc['SeriesInstanceUID'] # Unique identifier of the Series. + except: + ID2='ALL-STUDIES' + ID3='ALL-SERIES' + folderName = hashlib.sha224(ID1.encode('utf-8')).hexdigest() + "/" + \ + hashlib.sha224(ID2.encode('utf-8')).hexdigest() + "/" + hashlib.sha224(ID3.encode('utf-8')).hexdigest() + #check for existence of the folder tree patient/study/series. Create if it does not exist. + os.makedirs(png_destination + folderName,exist_ok=True) + + + pngfile = png_destination+folderName+'/' + hashlib.sha224(imName.encode('utf-8')).hexdigest() + '.png' + dicom_path = filedata.iloc[i].loc['file'] + image_path = png_destination+folderName+'/' + hashlib.sha224(imName.encode('utf-8')).hexdigest() + '.png' + if is16Bit: + # write the PNG file as a 16-bit greyscale + image_2d = ds.pixel_array.astype(np.double) + # # Rescaling grey scale between 0-255 + image_2d_scaled = (np.maximum(image_2d,0) / image_2d.max()) * 65535.0 + # # Convert to uint + shape = ds.pixel_array.shape + image_2d_scaled = np.uint16(image_2d_scaled) + with open(pngfile , 'wb') as png_file: + w = png.Writer(shape[1], shape[0], greyscale=True,bitdepth=16) + w.write(png_file, image_2d_scaled) + else: + shape = ds.pixel_array.shape + # # Convert to float to avoid overflow or underflow losses. 
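# Illustrative sketch (not part of this patch): the 16-bit branch above rescales the
# pixel array into the 0-65535 range and writes it with pypng as 16-bit greyscale.
# Stand-alone version, assuming a monochrome, 2-D pixel array:
import numpy as np
import png
import pydicom

def dicom_to_png16(dcm_path, png_path):
    ds = pydicom.dcmread(dcm_path)
    image_2d = ds.pixel_array.astype(np.double)
    # rescale grey levels into the full 16-bit range
    scaled = np.uint16((np.maximum(image_2d, 0) / image_2d.max()) * 65535.0)
    height, width = image_2d.shape
    with open(png_path, 'wb') as f:
        writer = png.Writer(width, height, greyscale=True, bitdepth=16)
        writer.write(f, scaled)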
+ image_2d = ds.pixel_array.astype(float) + # + # # Rescaling grey scale between 0-255 + image_2d_scaled = (np.maximum(image_2d,0) / image_2d.max()) * 255.0 + # + # # Convert to uint + image_2d_scaled = np.uint8(image_2d_scaled) + # # Write the PNG file + with open(pngfile , 'wb') as png_file: + w = png.Writer(shape[1], shape[0], greyscale=True) + w.write(png_file, image_2d_scaled) + filemapping = filedata.iloc[i].loc['file'] + ', ' + pngfile + '\n' + except AttributeError as error: + found_err = error + logging.error(found_err) + fail_path = filedata.iloc[i].loc['file'], failed + '1/' + os.path.split(filedata.iloc[i].loc['file'])[1][:-4]+'.dcm' + except ValueError as error: + found_err = error + logging.error(found_err) + fail_path = filedata.iloc[i].loc['file'], failed + '2/' + os.path.split(filedata.iloc[i].loc['file'])[1][:-4]+'.dcm' + except BaseException as error: + found_err = error + logging.error(found_err) + fail_path = filedata.iloc[i].loc['file'], failed + '3/' + os.path.split(filedata.iloc[i].loc['file'])[1][:-4]+'.dcm' + except Exception as error: + found_err = error + logging.error(found_err) + fail_path = filedata.iloc[i].loc['file'], failed + '4/' + os.path.split(filedata.iloc[i].loc['file'])[1][:-4]+'.dcm' + return (filemapping,fail_path,found_err) + + +#%%Function when pydicom fails to read a value attempt to read as +#other types. +def fix_mismatch_callback(raw_elem, **kwargs): + try: + values.convert_value(raw_elem.VR, raw_elem) + except TypeError: + for vr in kwargs['with_VRs']: + try: + values.convert_value(vr, raw_elem) + except TypeError: + pass + else: + raw_elem = raw_elem._replace(VR=vr) + break + return raw_elem + + +def get_path(depth): + directory = dicom_home + '/' + i = 0 + while i < depth: + directory += "*/" + i += 1 + return directory + "*.dcm" + +#%%Function used by pydicom. +def fix_mismatch(with_VRs=['PN', 'DS', 'IS']): + """A callback function to check that RawDataElements are translatable + with their provided VRs. If not, re-attempt translation using + some other translators. + Parameters + ---------- + with_VRs : list, [['PN', 'DS', 'IS']] + A list of VR strings to attempt if the raw data element value cannot + be translated with the raw data element's VR. + Returns + ------- + No return value. The callback function will return either + the original RawDataElement instance, or one with a fixed VR. + """ + dicom.config.data_element_callback = fix_mismatch_callback + config.data_element_callback_kwargs = { + 'with_VRs': with_VRs, + } + +fix_mismatch() +if processes == 0.5: # use half the cores to avoid high ram usage + core_count = int(os.cpu_count()/2) +elif processes == 0: # use all the cores + core_count = int(os.cpu_count()) +elif processes < os.cpu_count(): # use the specified number of cores to avoid high ram usage + core_count = processes +else: + core_count = int(os.cpu_count()) +#%% get set up to create dataframe +dirs = os.listdir(dicom_home) +#gets all dicom files. if editing this code, get filelist into the format of a list of strings, +#with each string as the file path to a different dicom file. 
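# Illustrative sketch (not part of this patch): the block just below caches the glob
# result in a pickle so that a re-run skips the directory walk. Minimal version of
# that cache-or-scan pattern:
import glob
import os
import pickle

def cached_file_list(pattern, pickle_path):
    if os.path.isfile(pickle_path):
        with open(pickle_path, 'rb') as f:
            return pickle.load(f)            # reuse the earlier scan
    filelist = glob.glob(pattern, recursive=True)
    with open(pickle_path, 'wb') as f:
        pickle.dump(filelist, f)
    return filelist

# usage, mirroring the depth-based pattern built by get_path():
# filelist = cached_file_list(dicom_home + '/*/*/*.dcm', pickle_file)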
+file_path = get_path(depth) + +if os.path.isfile(pickle_file): + f=open(pickle_file,'rb') + filelist=pickle.load(f) +else: + filelist=glob.glob(file_path, recursive=True) #this searches the folders at the depth we request and finds all dicoms + pickle.dump(filelist,open(pickle_file,'wb')) +file_chunks = np.array_split(filelist,no_splits) +logging.info('Number of dicom files: ' + str(len(filelist))) +logging.info('Number of chunks is 100 with size ' + str(len(file_chunks[0])) ) + +try: + ff = filelist[0] #load first file as a template to look at all +except IndexError: + logging.error("There is no file present in the given folder in " + file_path) + sys.exit(1) + +plan = dicom.dcmread(ff, force=True) +logging.debug('Loaded the first file successfully') + +keys = [(aa) for aa in plan.dir() if (hasattr(plan, aa) and aa!='PixelData')] +#%%checks for images in fields and prints where they are +for field in plan.dir(): + if (hasattr(plan, field) and field!='PixelData'): + entry = getattr(plan, field) + if type(entry) is bytes: + logging.debug(field) + logging.debug(str(entry)) +for i,chunk in enumerate(file_chunks): + csv_destination = "{}/meta/metadata_{}.csv".format(output_directory,i) + mappings ="{}/maps/mapping_{}.csv".format(output_directory,i) + fm = open(mappings, "w+") + filemapping = 'Original DICOM file location, PNG location \n' + fm.write(filemapping) + # add a check to see if the metadata has already been extracted + #%%step through whole file list, read in file, append fields to future dataframe of all files + headerlist = [] + #start up a multi processing pool + #for every item in filelist send data to a subprocess and run extract_headers func + #output is then added to headerlist as they are completed (no ordering is done) + with Pool(core_count) as p: + res= p.imap_unordered(extract_headers,enumerate(chunk)) + for i,e in enumerate(res): + headerlist.append(e) + data = pd.DataFrame(headerlist) + logging.info('Chunk ' + str(i) + ' Number of fields per file : ' + str(len(data.columns))) + #%%find common fields + #make dataframe containing all fields and all files minus those removed in previous block + #%%export csv file of final dataframe + export_csv = data.to_csv (csv_destination, index = None, header=True) + fields=data.keys() + count = 0 #potential painpoint + #writting of log handled by main process + if print_images: + logging.info("Start processing Images") + filedata=data + total = len(chunk) + stamp = time.time() + with Pool(core_count) as p: + res = p.imap_unordered(extract_images,range(len(filedata))) + for out in res: + (fmap,fail_path,err) = out + if err: + count +=1 + copyfile(fail_path[0],fail_path[1]) + err_msg = str(count) + ' out of ' + str(len(chunk)) + ' dicom images have failed extraction' + logging.error(err_msg) + else: + fm.write(fmap) + fm.close() + logging.info('Chunk run time: %s %s', time.time() - t_start, ' seconds!') + + +logging.info('Generating final metadata file') + +#identify the +col_names= set() +metas = glob.glob( "{}*.csv".format(meta_directory)) +#for each meta file identify the columns that are not na's for 90% of data +for meta in metas: + m = pd.read_csv(meta,dtype='str') + d_len = m.shape[0] + interest_names = [e for e in m.columns if ( (m[e]. 
isna()==True).sum() /d_len ) <.9 ] #count if percentage > .9 + col_names.update(interest_names) +#load every metadata file using only valid columns +meta_list = list() +for meta in metas: + m = pd.read_csv(meta,dtype='str',usecols=col_names) + meta_list.append(m) +merged_meta = pd.concat(meta_list,ignore_index=True) +merged_meta.to_csv('{}/metadata.csv'.format(output_directory),index=False) +#getting a single mapping file +logging.info('Generatign final mapping file') +mappings = glob.glob("{}/maps/*.csv".format(output_directory)) +map_list = list() +for mapping in mappings: + map_list.append(pd.read_csv(mapping,dtype='str')) +merged_maps = pd.concat(map_list,ignore_index=True) +if print_only_common_headers: + mask_common_fields = merged_maps.isnull().mean() < 0.1 + common_fields = set(np.asarray(merged_maps.columns)[mask_common_fields]) + merged_maps = merged_maps[common_fields] +merged_maps.to_csv('{}/mapping.csv'.format(output_directory),index=False) + + +if send_email: + subprocess.call('echo "Niffler has successfully completed the png conversion" | mail -s "The image conversion has been complete" {0}'.format(email), shell=True) +# Record the total run-time +logging.info('Total run time: %s %s', time.time() - t_start, ' seconds!') From 19fbe867ba9e0d5c8f51c38edad40a2569eb3d48 Mon Sep 17 00:00:00 2001 From: Ramon Luis Correa Medero Date: Tue, 6 Apr 2021 12:16:56 -0400 Subject: [PATCH 18/20] remove makedir calls --- modules/png-extraction/ImageExtractor.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/modules/png-extraction/ImageExtractor.py b/modules/png-extraction/ImageExtractor.py index d95a05d..3422f5f 100644 --- a/modules/png-extraction/ImageExtractor.py +++ b/modules/png-extraction/ImageExtractor.py @@ -153,8 +153,7 @@ def extract_images(i): ID=filedata.iloc[i].loc['PatientID'] # Unique identifier for the Patient. folderName = hashlib.sha224(ID.encode('utf-8')).hexdigest() #check for existence of patient folder. Create if it does not exist. - if not (os.path.exists(png_destination + folderName)): # it is completely possible for multiple proceses to run this check at same time. - os.mkdir(png_destination + folderName) + os.makedirs(png_destination + folderName,exist_ok=True) elif flattened_to_level == 'study': ID1=filedata.iloc[i].loc['PatientID'] # Unique identifier for the Patient. try: @@ -176,8 +175,7 @@ def extract_images(i): folderName = hashlib.sha224(ID1.encode('utf-8')).hexdigest() + "/" + \ hashlib.sha224(ID2.encode('utf-8')).hexdigest() + "/" + hashlib.sha224(ID3.encode('utf-8')).hexdigest() #check for existence of the folder tree patient/study/series. Create if it does not exist. - if not (os.path.exists(png_destination + folderName)): # it is completely possible for multiple proceses to run this check at same time. 
- os.makedirs(png_destination + folderName) + os.makedirs(png_destination + folderName,exist_ok=True) pngfile = png_destination+folderName+'/' + hashlib.sha224(imName.encode('utf-8')).hexdigest() + '.png' @@ -220,9 +218,6 @@ def extract_images(i): except BaseException as error: found_err = error logging.error(found_err) - print('---pokemon--') - print(error) - print(found_err) fail_path = filedata.iloc[i].loc['file'], failed + '3/' + os.path.split(filedata.iloc[i].loc['file'])[1][:-4]+'.dcm' except Exception as error: found_err = error From 7cc2832a1e46d26ab7d3466b188e28907041e2de Mon Sep 17 00:00:00 2001 From: Ramon Luis Correa Medero Date: Tue, 6 Apr 2021 12:17:28 -0400 Subject: [PATCH 19/20] remove accidental pussh to wrong dir --- modules/ImageExtractor.py | 396 -------------------------------------- 1 file changed, 396 deletions(-) delete mode 100644 modules/ImageExtractor.py diff --git a/modules/ImageExtractor.py b/modules/ImageExtractor.py deleted file mode 100644 index 3422f5f..0000000 --- a/modules/ImageExtractor.py +++ /dev/null @@ -1,396 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -import os -import glob -from shutil import copyfile -import hashlib -import json -import sys -import subprocess -import logging -from multiprocessing import Pool -import pdb -import time -import pickle -import numpy as np -import pandas as pd -import pydicom as dicom -import png -#pydicom imports needed to handle data errrors -from pydicom import config -from pydicom import datadict -from pydicom import values - -with open('config.json', 'r') as f: - niffler = json.load(f) - -#Get variables for StoreScp from config.json. -print_images = niffler['PrintImages'] -print_only_common_headers = niffler['CommonHeadersOnly'] -dicom_home = niffler['DICOMHome'] #the folder containing your dicom files -output_directory = niffler['OutputDirectory'] -depth = niffler['Depth'] -processes = niffler['UseProcesses'] #how many processes to use. 
-flattened_to_level = niffler['FlattenedToLevel'] -email = niffler['YourEmail'] -send_email = niffler['SendEmail'] -no_splits = niffler['SplitIntoChunks'] -is16Bit = niffler['is16Bit'] - -png_destination = output_directory + '/extracted-images/' -failed = output_directory +'/failed-dicom/' -maps_directory = output_directory + '/maps/' -meta_directory = output_directory + '/meta/' - -LOG_FILENAME = output_directory + '/ImageExtractor.out' -pickle_file = output_directory + '/ImageExtractor.pickle' -# record the start time -t_start = time.time() - -if not os.path.exists(output_directory): - os.makedirs(output_directory) - -logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG) - -if not os.path.exists(maps_directory): - os.makedirs(maps_directory) - -if not os.path.exists(meta_directory): - os.makedirs(meta_directory) - -if not os.path.exists(png_destination): - os.makedirs(png_destination) - -if not os.path.exists(failed): - os.makedirs(failed) - -if not os.path.exists(failed + "/1"): - os.makedirs(failed + "/1") - -if not os.path.exists(failed + "/2"): - os.makedirs(failed + "/2") - -if not os.path.exists(failed + "/3"): - os.makedirs(failed + "/3") - -if not os.path.exists(failed + "/4"): - os.makedirs(failed + "/4") - -#%%Function for getting tuple for field,val pairs -def get_tuples(plan, outlist = None, key = ""): - if len(key)>0: - key = key + "_" - if not outlist: - outlist = [] - for aa in plan.dir(): - try: - hasattr(plan,aa) - except TypeError as e: - logging.warning('Type Error encountered') - if (hasattr(plan, aa) and aa!='PixelData'): - value = getattr(plan, aa) - start = len(outlist) - #if dicom sequence extract tags from each element - if type(value) is dicom.sequence.Sequence: - for nn, ss in enumerate(list(value)): - newkey = "_".join([key,("%d"%nn),aa]) if len(key) else "_".join([("%d"%nn),aa]) - candidate = get_tuples(ss,outlist=None,key=newkey) - #if extracted tuples are too big condense to a string - if len(candidate)>2000: - outlist.append((newkey,str(candidate))) - else: - outlist.extend(candidate) - else: - if type(value) is dicom.valuerep.DSfloat: - value = float(value) - elif type(value) is dicom.valuerep.IS: - value = str(value) - elif type(value) is dicom.valuerep.MultiValue: - value = tuple(value) - elif type(value) is dicom.uid.UID: - value = str(value) - outlist.append((key + aa, value)) #appends name, value pair for this file. these are later concatenated to the dataframe - return outlist - - -def extract_headers(f_list_elem): - nn,ff = f_list_elem # unpack enumerated list - plan = dicom.dcmread(ff, force=True) #reads in dicom file - #checks if this file has an image - c=True - try: - check=plan.pixel_array #throws error if dicom file has no image - except: - c = False - kv = get_tuples(plan) #gets tuple for field,val pairs for this file. 
function defined above - # dicom images should not have more than 300 - if len(kv)>500: - logging.debug(str(len(kv)) + " dicoms produced by " + ff) - kv.append(('file',chunk[nn])) #adds my custom field with the original filepath - kv.append(('has_pix_array',c)) #adds my custom field with if file has image - if c: - kv.append(('category','uncategorized')) #adds my custom category field - useful if classifying images before processing - else: - kv.append(('category','no image')) #adds my custom category field, makes note as imageless - return dict(kv) - -#%%Function to extract pixel array information -#takes an integer used to index into the global filedata dataframe -#returns tuple of -# filemapping: dicom to png paths (as str) -# fail_path: dicom to failed folder (as tuple) -# found_err: error code produced when processing -def extract_images(i): - ds = dicom.dcmread(filedata.iloc[i].loc['file'], force=True) #read file in - found_err=None - filemapping = "" - fail_path = "" - try: - im=ds.pixel_array #pull image from read dicom - imName=os.path.split(filedata.iloc[i].loc['file'])[1][:-4] #get file name ex: IM-0107-0022 - - if flattened_to_level == 'patient': - ID=filedata.iloc[i].loc['PatientID'] # Unique identifier for the Patient. - folderName = hashlib.sha224(ID.encode('utf-8')).hexdigest() - #check for existence of patient folder. Create if it does not exist. - os.makedirs(png_destination + folderName,exist_ok=True) - elif flattened_to_level == 'study': - ID1=filedata.iloc[i].loc['PatientID'] # Unique identifier for the Patient. - try: - ID2=filedata.iloc[i].loc['StudyInstanceUID'] # Unique identifier for the Study. - except: - ID2='ALL-STUDIES' - folderName = hashlib.sha224(ID1.encode('utf-8')).hexdigest() + "/" + \ - hashlib.sha224(ID2.encode('utf-8')).hexdigest() - #check for existence of the folder tree patient/study/series. Create if it does not exist. - os.makedirs(png_destination + folderName,exist_ok=True) - else: - ID1=filedata.iloc[i].loc['PatientID'] # Unique identifier for the Patient. - try: - ID2=filedata.iloc[i].loc['StudyInstanceUID'] # Unique identifier for the Study. - ID3=filedata.iloc[i].loc['SeriesInstanceUID'] # Unique identifier of the Series. - except: - ID2='ALL-STUDIES' - ID3='ALL-SERIES' - folderName = hashlib.sha224(ID1.encode('utf-8')).hexdigest() + "/" + \ - hashlib.sha224(ID2.encode('utf-8')).hexdigest() + "/" + hashlib.sha224(ID3.encode('utf-8')).hexdigest() - #check for existence of the folder tree patient/study/series. Create if it does not exist. - os.makedirs(png_destination + folderName,exist_ok=True) - - - pngfile = png_destination+folderName+'/' + hashlib.sha224(imName.encode('utf-8')).hexdigest() + '.png' - dicom_path = filedata.iloc[i].loc['file'] - image_path = png_destination+folderName+'/' + hashlib.sha224(imName.encode('utf-8')).hexdigest() + '.png' - if is16Bit: - # write the PNG file as a 16-bit greyscale - image_2d = ds.pixel_array.astype(np.double) - # # Rescaling grey scale between 0-255 - image_2d_scaled = (np.maximum(image_2d,0) / image_2d.max()) * 65535.0 - # # Convert to uint - shape = ds.pixel_array.shape - image_2d_scaled = np.uint16(image_2d_scaled) - with open(pngfile , 'wb') as png_file: - w = png.Writer(shape[1], shape[0], greyscale=True,bitdepth=16) - w.write(png_file, image_2d_scaled) - else: - shape = ds.pixel_array.shape - # # Convert to float to avoid overflow or underflow losses. 
- image_2d = ds.pixel_array.astype(float) - # - # # Rescaling grey scale between 0-255 - image_2d_scaled = (np.maximum(image_2d,0) / image_2d.max()) * 255.0 - # - # # Convert to uint - image_2d_scaled = np.uint8(image_2d_scaled) - # # Write the PNG file - with open(pngfile , 'wb') as png_file: - w = png.Writer(shape[1], shape[0], greyscale=True) - w.write(png_file, image_2d_scaled) - filemapping = filedata.iloc[i].loc['file'] + ', ' + pngfile + '\n' - except AttributeError as error: - found_err = error - logging.error(found_err) - fail_path = filedata.iloc[i].loc['file'], failed + '1/' + os.path.split(filedata.iloc[i].loc['file'])[1][:-4]+'.dcm' - except ValueError as error: - found_err = error - logging.error(found_err) - fail_path = filedata.iloc[i].loc['file'], failed + '2/' + os.path.split(filedata.iloc[i].loc['file'])[1][:-4]+'.dcm' - except BaseException as error: - found_err = error - logging.error(found_err) - fail_path = filedata.iloc[i].loc['file'], failed + '3/' + os.path.split(filedata.iloc[i].loc['file'])[1][:-4]+'.dcm' - except Exception as error: - found_err = error - logging.error(found_err) - fail_path = filedata.iloc[i].loc['file'], failed + '4/' + os.path.split(filedata.iloc[i].loc['file'])[1][:-4]+'.dcm' - return (filemapping,fail_path,found_err) - - -#%%Function when pydicom fails to read a value attempt to read as -#other types. -def fix_mismatch_callback(raw_elem, **kwargs): - try: - values.convert_value(raw_elem.VR, raw_elem) - except TypeError: - for vr in kwargs['with_VRs']: - try: - values.convert_value(vr, raw_elem) - except TypeError: - pass - else: - raw_elem = raw_elem._replace(VR=vr) - break - return raw_elem - - -def get_path(depth): - directory = dicom_home + '/' - i = 0 - while i < depth: - directory += "*/" - i += 1 - return directory + "*.dcm" - -#%%Function used by pydicom. -def fix_mismatch(with_VRs=['PN', 'DS', 'IS']): - """A callback function to check that RawDataElements are translatable - with their provided VRs. If not, re-attempt translation using - some other translators. - Parameters - ---------- - with_VRs : list, [['PN', 'DS', 'IS']] - A list of VR strings to attempt if the raw data element value cannot - be translated with the raw data element's VR. - Returns - ------- - No return value. The callback function will return either - the original RawDataElement instance, or one with a fixed VR. - """ - dicom.config.data_element_callback = fix_mismatch_callback - config.data_element_callback_kwargs = { - 'with_VRs': with_VRs, - } - -fix_mismatch() -if processes == 0.5: # use half the cores to avoid high ram usage - core_count = int(os.cpu_count()/2) -elif processes == 0: # use all the cores - core_count = int(os.cpu_count()) -elif processes < os.cpu_count(): # use the specified number of cores to avoid high ram usage - core_count = processes -else: - core_count = int(os.cpu_count()) -#%% get set up to create dataframe -dirs = os.listdir(dicom_home) -#gets all dicom files. if editing this code, get filelist into the format of a list of strings, -#with each string as the file path to a different dicom file. 
-file_path = get_path(depth) - -if os.path.isfile(pickle_file): - f=open(pickle_file,'rb') - filelist=pickle.load(f) -else: - filelist=glob.glob(file_path, recursive=True) #this searches the folders at the depth we request and finds all dicoms - pickle.dump(filelist,open(pickle_file,'wb')) -file_chunks = np.array_split(filelist,no_splits) -logging.info('Number of dicom files: ' + str(len(filelist))) -logging.info('Number of chunks is 100 with size ' + str(len(file_chunks[0])) ) - -try: - ff = filelist[0] #load first file as a template to look at all -except IndexError: - logging.error("There is no file present in the given folder in " + file_path) - sys.exit(1) - -plan = dicom.dcmread(ff, force=True) -logging.debug('Loaded the first file successfully') - -keys = [(aa) for aa in plan.dir() if (hasattr(plan, aa) and aa!='PixelData')] -#%%checks for images in fields and prints where they are -for field in plan.dir(): - if (hasattr(plan, field) and field!='PixelData'): - entry = getattr(plan, field) - if type(entry) is bytes: - logging.debug(field) - logging.debug(str(entry)) -for i,chunk in enumerate(file_chunks): - csv_destination = "{}/meta/metadata_{}.csv".format(output_directory,i) - mappings ="{}/maps/mapping_{}.csv".format(output_directory,i) - fm = open(mappings, "w+") - filemapping = 'Original DICOM file location, PNG location \n' - fm.write(filemapping) - # add a check to see if the metadata has already been extracted - #%%step through whole file list, read in file, append fields to future dataframe of all files - headerlist = [] - #start up a multi processing pool - #for every item in filelist send data to a subprocess and run extract_headers func - #output is then added to headerlist as they are completed (no ordering is done) - with Pool(core_count) as p: - res= p.imap_unordered(extract_headers,enumerate(chunk)) - for i,e in enumerate(res): - headerlist.append(e) - data = pd.DataFrame(headerlist) - logging.info('Chunk ' + str(i) + ' Number of fields per file : ' + str(len(data.columns))) - #%%find common fields - #make dataframe containing all fields and all files minus those removed in previous block - #%%export csv file of final dataframe - export_csv = data.to_csv (csv_destination, index = None, header=True) - fields=data.keys() - count = 0 #potential painpoint - #writting of log handled by main process - if print_images: - logging.info("Start processing Images") - filedata=data - total = len(chunk) - stamp = time.time() - with Pool(core_count) as p: - res = p.imap_unordered(extract_images,range(len(filedata))) - for out in res: - (fmap,fail_path,err) = out - if err: - count +=1 - copyfile(fail_path[0],fail_path[1]) - err_msg = str(count) + ' out of ' + str(len(chunk)) + ' dicom images have failed extraction' - logging.error(err_msg) - else: - fm.write(fmap) - fm.close() - logging.info('Chunk run time: %s %s', time.time() - t_start, ' seconds!') - - -logging.info('Generating final metadata file') - -#identify the -col_names= set() -metas = glob.glob( "{}*.csv".format(meta_directory)) -#for each meta file identify the columns that are not na's for 90% of data -for meta in metas: - m = pd.read_csv(meta,dtype='str') - d_len = m.shape[0] - interest_names = [e for e in m.columns if ( (m[e]. 
isna()==True).sum() /d_len ) <.9 ] #count if percentage > .9 - col_names.update(interest_names) -#load every metadata file using only valid columns -meta_list = list() -for meta in metas: - m = pd.read_csv(meta,dtype='str',usecols=col_names) - meta_list.append(m) -merged_meta = pd.concat(meta_list,ignore_index=True) -merged_meta.to_csv('{}/metadata.csv'.format(output_directory),index=False) -#getting a single mapping file -logging.info('Generatign final mapping file') -mappings = glob.glob("{}/maps/*.csv".format(output_directory)) -map_list = list() -for mapping in mappings: - map_list.append(pd.read_csv(mapping,dtype='str')) -merged_maps = pd.concat(map_list,ignore_index=True) -if print_only_common_headers: - mask_common_fields = merged_maps.isnull().mean() < 0.1 - common_fields = set(np.asarray(merged_maps.columns)[mask_common_fields]) - merged_maps = merged_maps[common_fields] -merged_maps.to_csv('{}/mapping.csv'.format(output_directory),index=False) - - -if send_email: - subprocess.call('echo "Niffler has successfully completed the png conversion" | mail -s "The image conversion has been complete" {0}'.format(email), shell=True) -# Record the total run-time -logging.info('Total run time: %s %s', time.time() - t_start, ' seconds!') From d4de24c4b7f5437f24cdf8d15effe98744dcd986 Mon Sep 17 00:00:00 2001 From: Ramon Luis Correa Medero Date: Tue, 6 Apr 2021 12:20:23 -0400 Subject: [PATCH 20/20] remove erroneous logging --- modules/png-extraction/ImageExtractor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/png-extraction/ImageExtractor.py b/modules/png-extraction/ImageExtractor.py index 3422f5f..909dbad 100644 --- a/modules/png-extraction/ImageExtractor.py +++ b/modules/png-extraction/ImageExtractor.py @@ -294,7 +294,6 @@ def fix_mismatch(with_VRs=['PN', 'DS', 'IS']): pickle.dump(filelist,open(pickle_file,'wb')) file_chunks = np.array_split(filelist,no_splits) logging.info('Number of dicom files: ' + str(len(filelist))) -logging.info('Number of chunks is 100 with size ' + str(len(file_chunks[0])) ) try: ff = filelist[0] #load first file as a template to look at all
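# Illustrative sketch (not part of this patch series): the metadata-merge step that
# closes ImageExtractor.py (seen in patches 17 and 19 above) keeps the columns that
# are non-null often enough in each per-chunk CSV and concatenates the chunks into
# one metadata.csv. The 0.9 threshold mirrors the original; the callable usecols is
# an assumption added here so a chunk that lacks a column does not raise.
import glob
import pandas as pd

def merge_metadata(meta_dir, out_csv, max_na_fraction=0.9):
    col_names = set()
    metas = glob.glob('{}/*.csv'.format(meta_dir))
    for meta in metas:
        m = pd.read_csv(meta, dtype='str')
        keep = [c for c in m.columns
                if m[c].isna().sum() / max(len(m), 1) < max_na_fraction]
        col_names.update(keep)
    frames = [pd.read_csv(meta, dtype='str', usecols=lambda c: c in col_names)
              for meta in metas]
    pd.concat(frames, ignore_index=True).to_csv(out_csv, index=False)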