From 03f70b7df6a915f58e77e7ffb6cd56d31dd55aa5 Mon Sep 17 00:00:00 2001
From: pavan kumar bellam <pavan94411@gmail.com>
Date: Wed, 23 Mar 2022 06:48:29 +0530
Subject: [PATCH 1/5] supporting extraction of only specific tasks

---
 modules/png-extraction/ImageExtractor.py | 88 +++++++++++++++++-------
 modules/png-extraction/config.json       |  4 +-
 modules/png-extraction/featureset.txt    |  0
 3 files changed, 68 insertions(+), 24 deletions(-)
 create mode 100644 modules/png-extraction/featureset.txt

diff --git a/modules/png-extraction/ImageExtractor.py b/modules/png-extraction/ImageExtractor.py
index 84f6a40..f1827eb 100644
--- a/modules/png-extraction/ImageExtractor.py
+++ b/modules/png-extraction/ImageExtractor.py
@@ -38,7 +38,10 @@ def initialize_config_and_execute(config_values):
 
     print_images = bool(configs['PrintImages'])
     print_only_common_headers = bool(configs['CommonHeadersOnly'])
-    print_only_public_headers = bool(configs['PublicHeadersOnly'])
+    global public_headers_bool
+    public_headers_bool = bool(configs['PublicHeadersOnly'])
+    global SpecificHeadersOnly
+    SpecificHeadersOnly = bool(configs['SpecificHeadersOnly'])
     depth = int(configs['Depth'])
     processes = int(configs['UseProcesses']) # how many processes to use.
     flattened_to_level = configs['FlattenedToLevel']
@@ -90,7 +93,7 @@ def initialize_config_and_execute(config_values):
         os.makedirs(failed + "/4")
 
     logging.info("------- Values Initialization DONE -------")
-    final_res = execute(pickle_file, dicom_home, output_directory, print_images, print_only_common_headers, print_only_public_headers, depth,
+    final_res = execute(pickle_file, dicom_home, output_directory, print_images, print_only_common_headers, depth,
                         processes, flattened_to_level, email, send_email, no_splits, is16Bit, png_destination,
         failed, maps_directory, meta_directory, LOG_FILENAME, metadata_col_freq_threshold, t_start)
     return final_res
@@ -102,19 +105,54 @@ def get_tuples(plan, outlist = None, key = ""):
         key =  key + "_"
     if not outlist:
         outlist = []
-    for aa  in plan.dir():
+    headers=[]
+    if(SpecificHeadersOnly):
+        if(len(feature_list)==0):
+            logging.error("featureset.txt is empty")
+            sys.exit()
+        try:
+            headers.append(plan['PatientID'])
+        except:
+            plan.PatientId = ""
+            headers.append(plan['PatientId'])
+        try:
+            headers.append(plan['SeriesInstanceUID'])
+        except:
+            plan.SeriesInstanceUID = ""
+            headers.append(plan['SeriesInstanceUID'])
+        try:
+            headers.append(plan['PhotometricInterpretation'])
+        except:
+            plan.PhotometricInterpretation = ""
+            headers.append(plan['PhotometricInterpretation'])
+        try:
+            headers.append(plan['StudyInstanceUID'])
+        except:
+            plan.StudyInstanceUID = ""
+            headers.append(plan['StudyInstanceUID'])
+        for i in feature_list:
+            if plan[i] not in headers:
+                headers.append(plan[i])
+    else:
+        if (public_headers_bool):
+            for aa in plan.dir():
+                headers.append(plan[aa])
+        else:
+            headers = [i for i in plan]
+    for aa  in headers:
         try:
-            hasattr(plan,aa)
+            hasattr(plan,aa.name)
         except TypeError as e:
             logging.warning('Type Error encountered')
             continue
-        if hasattr(plan, aa) and aa!= 'PixelData':
-            value = getattr(plan, aa)
+        name = aa.name.replace(" ", "").replace("[", "").replace("]", "")
+        if name!= 'PixelData':
+            value = aa.value
             start = len(outlist)
             # if dicom sequence extract tags from each element
             if type(value) is dicom.sequence.Sequence:
                 for nn, ss in enumerate(list(value)):
-                    newkey = "_".join([key,("%d"%nn),aa]) if len(key) else "_".join([("%d"%nn),aa])
+                    newkey = "_".join([key,("%d"%nn),name]) if len(key) else "_".join([("%d"%nn),name])
                     candidate = get_tuples(ss,outlist=None,key=newkey)
                     # if extracted tuples are too big condense to a string
                     if len(candidate)>2000:
@@ -130,22 +168,15 @@ def get_tuples(plan, outlist = None, key = ""):
                     value = tuple(value)
                 elif type(value) is dicom.uid.UID:
                     value = str(value)
-                outlist.append((key + aa, value))
-                # appends name, value pair for this file. these are later concatenated to the dataframe
-    # appends the private tags
-    if not public_headers_bool:
-        x = plan.keys()
-        x = list(x)
-        for i in x:
-            if i.is_private:
-                outlist.append((plan[i].name, plan[i].value))
-
+                if (not isinstance(value, str)):
+                    outlist.append((key + name, value))
+                if (isinstance(value, str) and len(value) < 300):
+                    outlist.append((key + name, value))
     return outlist
 
 
 def extract_headers(f_list_elem):
-    global public_headers_bool
-    nn,ff, public_headers_bool = f_list_elem # unpack enumerated list
+    nn,ff = f_list_elem # unpack enumerated list
     plan = dicom.dcmread(ff, force=True)  # reads in dicom file
     # checks if this file has an image
     c=True
@@ -320,7 +351,7 @@ def fix_mismatch(with_VRs=['PN', 'DS', 'IS', 'LO', 'OB']):
     }    
 
 
-def execute(pickle_file, dicom_home, output_directory, print_images, print_only_common_headers, print_only_public_headers, depth,
+def execute(pickle_file, dicom_home, output_directory, print_images, print_only_common_headers, depth,
             processes, flattened_to_level, email, send_email, no_splits, is16Bit, png_destination,
     failed, maps_directory, meta_directory, LOG_FILENAME, metadata_col_freq_threshold, t_start):
     err = None
@@ -380,10 +411,16 @@ def execute(pickle_file, dicom_home, output_directory, print_images, print_only_
         # start up a multi processing pool
         # for every item in filelist send data to a subprocess and run extract_headers func
         # output is then added to headerlist as they are completed (no ordering is done)
+        try:
+            global feature_list
+            feature_list = open("featureset.txt").read().splitlines()
+        except:
+            logging.error("featureset.txt not found")
+            feature_list=[]
+
         with Pool(core_count) as p:
             # we send here print_only_public_headers bool value
-            file_chunks_list = [tups + (print_only_public_headers,) for tups in enumerate(chunk)]
-            res= p.imap_unordered(extract_headers, file_chunks_list)
+            res = p.imap_unordered(extract_headers, enumerate(chunk))
             for i,e in enumerate(res):
                 headerlist.append(e)
         data = pd.DataFrame(headerlist)
@@ -391,7 +428,11 @@ def execute(pickle_file, dicom_home, output_directory, print_images, print_only_
         # find common fields
         # make dataframe containing all fields and all files minus those removed in previous block
         # export csv file of final dataframe
-        export_csv = data.to_csv(csv_destination, index = None, header=True)
+        if (not public_headers_bool) and len(feature_list) > 0:
+            export_csv = data.loc[:, data.columns.isin(feature_list)].to_csv(csv_destination, index=None, header=True)
+        else:
+            export_csv = data.to_csv(csv_destination, index=None, header=True)
+
         fields=data.keys()
         count = 0 # potential painpoint
         # writting of log handled by main process
@@ -493,6 +534,7 @@ def execute(pickle_file, dicom_home, output_directory, print_images, print_only_
     ap.add_argument("--PrintImages", default=niffler['PrintImages'])
     ap.add_argument("--CommonHeadersOnly", default=niffler['CommonHeadersOnly'])
     ap.add_argument("--PublicHeadersOnly", default=niffler['PublicHeadersOnly'])
+    ap.add_argument("--SpecificHeadersOnly", default=niffler['SpecificHeadersOnly'])
     ap.add_argument("--UseProcesses", default=niffler['UseProcesses'])
     ap.add_argument("--FlattenedToLevel", default=niffler['FlattenedToLevel'])
     ap.add_argument("--is16Bit", default=niffler['is16Bit'])
diff --git a/modules/png-extraction/config.json b/modules/png-extraction/config.json
index 12cb197..6f4b64d 100644
--- a/modules/png-extraction/config.json
+++ b/modules/png-extraction/config.json
@@ -6,9 +6,11 @@
 	"PrintImages": true,
 	"CommonHeadersOnly": false,
 	"PublicHeadersOnly": true,
+	"SpecificHeadersOnly": false
 	"UseProcesses": 0,
 	"FlattenedToLevel": "patient",
 	"is16Bit":true,
 	"SendEmail": true,
-	"YourEmail": "test@test.test"
+	"YourEmail": "test@test.test",
+
 }
diff --git a/modules/png-extraction/featureset.txt b/modules/png-extraction/featureset.txt
new file mode 100644
index 0000000..e69de29

From de40fb8bf1f93d63b3fb7f711b476cac48e0b991 Mon Sep 17 00:00:00 2001
From: pavan kumar bellam <60264606+Pavan-Bellam@users.noreply.github.com>
Date: Wed, 23 Mar 2022 07:17:33 +0530
Subject: [PATCH 2/5] Update ImageExtractor.py

---
 modules/png-extraction/ImageExtractor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/png-extraction/ImageExtractor.py b/modules/png-extraction/ImageExtractor.py
index f1827eb..85998c7 100644
--- a/modules/png-extraction/ImageExtractor.py
+++ b/modules/png-extraction/ImageExtractor.py
@@ -428,7 +428,7 @@ def execute(pickle_file, dicom_home, output_directory, print_images, print_only_
         # find common fields
         # make dataframe containing all fields and all files minus those removed in previous block
         # export csv file of final dataframe
-        if (not public_headers_bool) and len(feature_list) > 0:
+        if (SpecificHeadersOnly):
             export_csv = data.loc[:, data.columns.isin(feature_list)].to_csv(csv_destination, index=None, header=True)
         else:
             export_csv = data.to_csv(csv_destination, index=None, header=True)

From d046e616856848c676afe1046098b325a5aee31e Mon Sep 17 00:00:00 2001
From: pavan kumar bellam <60264606+Pavan-Bellam@users.noreply.github.com>
Date: Wed, 23 Mar 2022 07:25:12 +0530
Subject: [PATCH 3/5] Update README.md

---
 modules/png-extraction/README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modules/png-extraction/README.md b/modules/png-extraction/README.md
index 1011998..30d8590 100644
--- a/modules/png-extraction/README.md
+++ b/modules/png-extraction/README.md
@@ -37,6 +37,8 @@ The below two fields can be left unmodified for most executions. The default val
 
 * *PublicHeadersOnly*: Do you want the resulting dataframe csv to contain only the public headers? Then set it as _true_(default). For extract all the private headers set as _false_.
 
+* *SpecificHeadersOnly*: If you want only certain attributes in the extracted csv, set this value to _true_ and list the required attribute names in featureset.txt, one per line. The default value is _false_. Do not delete featureset.txt even if you do not use this option.
+
 
 ## Running the Niffler PNG Extractor
 ```bash

From ff8b861bd6ddb9c499ddfda3e394fdfc9378c6e1 Mon Sep 17 00:00:00 2001
From: Pradeeban Kathiravelu <kk.pradeeban@gmail.com>
Date: Tue, 22 Mar 2022 22:21:01 -0400
Subject: [PATCH 4/5] Fix config.json

---
 modules/png-extraction/config.json | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/modules/png-extraction/config.json b/modules/png-extraction/config.json
index 6f4b64d..f65588f 100644
--- a/modules/png-extraction/config.json
+++ b/modules/png-extraction/config.json
@@ -6,11 +6,10 @@
 	"PrintImages": true,
 	"CommonHeadersOnly": false,
 	"PublicHeadersOnly": true,
-	"SpecificHeadersOnly": false
+	"SpecificHeadersOnly": false,
 	"UseProcesses": 0,
 	"FlattenedToLevel": "patient",
 	"is16Bit":true,
 	"SendEmail": true,
-	"YourEmail": "test@test.test",
-
+	"YourEmail": "test@test.test"
 }

From f860a9185687960473008d5c3b5c8923ce72e26a Mon Sep 17 00:00:00 2001
From: Pradeeban Kathiravelu <kk.pradeeban@gmail.com>
Date: Tue, 22 Mar 2022 22:23:27 -0400
Subject: [PATCH 5/5] Give a sample value to featureset

---
 modules/png-extraction/featureset.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/modules/png-extraction/featureset.txt b/modules/png-extraction/featureset.txt
index e69de29..8e8ab10 100644
--- a/modules/png-extraction/featureset.txt
+++ b/modules/png-extraction/featureset.txt
@@ -0,0 +1 @@
+AccessionNumber