Merge pull request #345 from Emory-HITI/dev
A C-ECHO Method for cold-extraction
pradeeban authored Aug 26, 2022
2 parents c5c6835 + 598f22a commit de47043
Showing 11 changed files with 176 additions and 51 deletions.
34 changes: 32 additions & 2 deletions modules/cold-extraction/README.md
@@ -208,7 +208,7 @@ To activate, use the below value,
"FilePath": "CFIND-DETAILED",
```

## Troubleshooting
# Troubleshooting

If the process fails even when no one else's Niffler process is running, check your log file (UNIQUE-OUTPUT-FILE-FOR-YOUR-EXTRACTION.out)

@@ -241,11 +241,38 @@ $ sudo ps -xa | grep storescp
$ sudo kill 241720
```
## Testing your deployment with Niffler C-ECHO Implementation

Sometimes your connection may not succeed due to firewall issues or because the source PACS does not recognize your end as a valid AET. To confirm and rule out these issues, you can issue a C-ECHO command using the TestConnection.py script included with Niffler.

**First, please make sure your system.json is updated with the correct values for "SrcAet" and "QueryAet"**.
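
For reference, the TestConnection.py script added in this commit expects "SrcAet" in the SOURCEAET@host:port form and "QueryAet" in the QUERYAET:port form. A minimal illustrative fragment of system.json could look like the following; the AE titles, host, and port values are placeholders, not recommended defaults.

```
"SrcAet": "SOURCEPACS@10.0.0.5:4242",
"QueryAet": "QUERYNIFFLER:4243"
```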

Then, run the command below.

````
$ python3 TestConnection.py
````

The output below indicates success.
````
C-ECHO request status: 0x0000
````

If you receive any other output, such as the one below, the connection was not successful.
````
Association rejected, aborted or never connected
````

Please check the "SrcAet" and "QueryAet" values in system.json again for correctness.

If everything is correct on your (Niffler) end, please consult the team managing your enterprise PACS deployment about its configuration. Is it configured to accept queries from your "QueryAet"? Is there a firewall? Is that firewall configured to accept queries **from** your QueryAet (host and port)?
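
Independently of DICOM, you can also rule out basic network reachability of the source PACS host from your extraction server. The sketch below is not part of Niffler; it assumes "SrcAet" follows the SOURCEAET@host:port form used by TestConnection.py and only verifies that a TCP connection can be opened. It does not validate AE titles.

```
# reachability_check.py -- illustrative sketch, not part of Niffler.
import json
import socket

with open('system.json', 'r') as f:
    niffler = json.load(f)

# "SrcAet" is assumed to be in the SOURCEAET@host:port form.
aet_and_host, port = niffler['SrcAet'].split(':')
host = aet_and_host.split('@')[1]

try:
    # Attempt a plain TCP connection to the PACS host and port.
    with socket.create_connection((host, int(port)), timeout=5):
        print('TCP connection to {}:{} succeeded'.format(host, port))
except OSError as e:
    print('TCP connection to {}:{} failed: {}'.format(host, port, e))
```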


## Testing your deployment with DCM4CHE

Niffler strives to be stable for at least the latest stable releases. However, since it is still an open-source research project by a university research group, it may have bugs at times, which we aim to fix as soon as we spot them. If your extraction fails for some reason, you can rule out whether the issue is really a Niffler bug or something else, such as a problem in the PACS connection.

Simply start a storescp and movescu clients (in that order) of DCM4CHE from the server where you are attempting to run. If the below commands work, but Niffler still fails (after correctly following the README), it could indicate a Niffler bug.
Simply start the storescp and movescu clients of DCM4CHE (in that order) from the server where you are attempting to run Niffler. If the commands below work but Niffler still fails (after correctly following the README), it could indicate a Niffler bug.

The requests take the following format.

@@ -274,3 +301,6 @@ nohup /opt/dcm4che-5.22.5/bin/storescp --accept-unknown --directory new-pydicom
C-MOVE
nohup /opt/dcm4che-5.22.5/bin/movescu -c "[email protected]:104" -b "QBNIFFLER:4243" -M PatientRoot -m PatientID=12345678 --dest QBNIFFLER > movescu.out &
```

If the testing with DCM4CHE as above does not work, the issue likely lies with your PACS configuration for sending DICOM data to your endpoint. In that case, please get the above commands to work first before attempting the extraction with Niffler.

38 changes: 38 additions & 0 deletions modules/cold-extraction/TestConnection.py
@@ -0,0 +1,38 @@
from pynetdicom import AE
from pynetdicom.sop_class import VerificationSOPClass
import json

# Reads the Niffler system configuration file, "system.json".
with open('system.json', 'r') as f:
    niffler = json.load(f)


# "QueryAet" is expected in the QUERYAET:port form.
QUERY_AET = niffler['QueryAet']
query = QUERY_AET.split(':')

ae = AE()
ae.ae_title = query[0]
ae.port = int(query[1])

ae.add_requested_context(VerificationSOPClass)

# "SrcAet" is expected in the SOURCEAET@host:port form.
SRC_AET = niffler['SrcAet']
srct = SRC_AET.split(':')

port = int(srct[1])
src = srct[0].split('@')

assoc = ae.associate(src[1], port, ae_title=src[0])

if assoc.is_established:
    status = assoc.send_c_echo()

    if status:
        # If successful, the output status will be: 0x0000
        print('C-ECHO request status: 0x{0:04x}'.format(status.Status))
    else:
        print('Connection timed out, was aborted or received invalid response')

    assoc.release()
else:
    print('Association rejected, aborted or never connected')
59 changes: 30 additions & 29 deletions modules/png-extraction/ImageExtractor.py
@@ -469,38 +469,39 @@ def execute(pickle_file, dicom_home, output_directory, print_images, print_only_

metas = glob.glob("{}*.csv".format(meta_directory))
# for each meta file identify the columns that are not na's for at least 10% (metadata_col_freq_threshold) of data
Removed:

for meta in metas:
    m = pd.read_csv(meta, dtype='str')
    d_len = m.shape[0]
    total_length += d_len

    for e in m.columns:
        col_pop = d_len - np.sum(m[e].isna())  # number of populated rows for this column in this metadata file

        if e in col_names:
            col_names[e] += col_pop
        else:
            col_names[e] = col_pop

        # all_headers keeps track of number of appearances of each header. We later use this count to ensure that
        # the headers we use are present in all metadata files.
        if e in all_headers:
            all_headers[e] += 1
        else:
            all_headers[e] = 1

loadable_names = list()
for k in col_names.keys():
    if k in all_headers and all_headers[k] >= no_splits:  # no_splits == number of batches used
        if col_names[k] >= metadata_col_freq_threshold * total_length:
            loadable_names.append(k)  # use header only if it's present in every metadata file

# load every metadata file using only valid columns
meta_list = list()
for meta in metas:
    m = pd.read_csv(meta, dtype='str', usecols=loadable_names)
    meta_list.append(m)
merged_meta = pd.concat(meta_list, ignore_index=True)

Added:

if print_only_common_headers:
    for meta in metas:
        m = pd.read_csv(meta, dtype='str')
        d_len = m.shape[0]
        total_length += d_len

        for e in m.columns:
            col_pop = d_len - np.sum(m[e].isna())  # number of populated rows for this column in this metadata file

            if e in col_names:
                col_names[e] += col_pop
            else:
                col_names[e] = col_pop

            # all_headers keeps track of number of appearances of each header. We later use this count to ensure that
            # the headers we use are present in all metadata files.
            if e in all_headers:
                all_headers[e] += 1
            else:
                all_headers[e] = 1

    loadable_names = list()
    for k in col_names.keys():
        if k in all_headers and all_headers[k] >= no_splits:  # no_splits == number of batches used
            if col_names[k] >= metadata_col_freq_threshold * total_length:
                loadable_names.append(k)  # use header only if it's present in every metadata file

    # load every metadata file using only valid columns
    meta_list = list()
    for meta in metas:
        m = pd.read_csv(meta, dtype='str', usecols=loadable_names)
        meta_list.append(m)
    merged_meta = pd.concat(meta_list, ignore_index=True)

# merging_meta
merged_meta = pd.DataFrame()
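
As a side note on the change above: a metadata column is kept only if it appears in every metadata chunk (all_headers[k] >= no_splits) and is populated in at least a metadata_col_freq_threshold fraction of all rows. The self-contained sketch below illustrates that selection rule with made-up counts; it is not part of this commit.

```
# Illustrative sketch of the loadable_names selection rule; counts are made up.
no_splits = 2                        # number of metadata chunks (batches)
metadata_col_freq_threshold = 0.1    # keep columns populated in at least 10% of rows
total_length = 1000                  # total rows across all chunks

col_names = {'PatientID': 1000, 'SeriesTime': 950, 'RareTag': 30}   # populated-row counts
all_headers = {'PatientID': 2, 'SeriesTime': 2, 'RareTag': 1}       # chunk appearance counts

loadable_names = []
for k in col_names:
    if all_headers.get(k, 0) >= no_splits and col_names[k] >= metadata_col_freq_threshold * total_length:
        loadable_names.append(k)

print(loadable_names)  # ['PatientID', 'SeriesTime'] -- RareTag is dropped
```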
12 changes: 9 additions & 3 deletions modules/suvpar/Suvpar.py
@@ -12,7 +12,7 @@


def initialize():
    global output_csv, df, device_SN, scanner_filter, statistics_csv, isStatistics, final_csv, isAnonymized
    global output_csv, df, device_SN, scanner_filter, statistics_csv, isStatistics, final_csv, isAnonymized, ris_df, is_merge_with_ris
    with open('config.json', 'r') as f:
        config = json.load(f)

@@ -23,6 +23,8 @@ def initialize():
    scanner_filter = bool(config['ScannerFilter'])
    statistics_csv = config['Statistics_File']
    isStatistics = bool(config['IsStatistics'])
    ris_csv = config['RIS_File']
    is_merge_with_ris = bool(config['IsMergeWithRis'])
    final_csv = bool(config['IsFinalCSV'])
    isAnonymized = bool(config['IsAnonymized'])
    text_file = open(feature_file, "r")
@@ -31,6 +33,7 @@ def initialize():
    scanner_file = open(scanner_file, "r")
    device_SN = scanner_file.read().split('\n')
    df = pandas.read_csv(filename, usecols=lambda x: x in feature_list, sep=',')
    ris_df = pandas.read_csv(ris_csv)


def suvpar():
@@ -52,8 +55,8 @@ def suvpar():
    df.dropna(subset=["SeriesTime"], inplace=True)
    df.dropna(subset=["SeriesDate"], inplace=True)
    df.dropna(subset=["DeviceSerialNumber"], inplace=True)
    # Consider only the ImageType that are ORIGINAL.
    df = df[df['ImageType'].str.contains("ORIGINAL")]
    # Remove ImageType values that contain NA or NPR, unless they also contain ORIGINAL.
    df = df[(~df['ImageType'].str.contains('NA|NPR')) | (df['ImageType'].str.contains('ORIGINAL'))]
    # Consider only MR. Remove modalities such as PR and SR that are present in the original data.
    df = df[df.Modality == "MR"]
    # Dataset after removing unwanted Device Serial Number
@@ -362,6 +365,9 @@ def suvpar():
    sta = df.describe()
    sta.loc['count', 'MultiStudyEncounter'] = df[df['MultiStudyEncounter'] == True]['InstanceNumber'].count()

    if is_merge_with_ris:
        df = pandas.merge(df, ris_df, on='PatientID')


def write():
    global isStatistics
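
A note on the new RIS merge in Suvpar.py above: pandas.merge with on='PatientID' performs an inner join by default, so metadata rows whose PatientID has no match in the RIS file are dropped from the merged frame. Below is a minimal self-contained sketch with made-up values, not taken from this commit.

```
# Illustrative sketch of the default inner-join behaviour of pandas.merge on PatientID.
import pandas

df = pandas.DataFrame({'PatientID': [1111, 22222, 33333], 'Modality': ['MR', 'MR', 'MR']})
ris_df = pandas.DataFrame({'PatientID': [1111, 22222], 'Encounter Type': ['OV', 'IN']})

merged = pandas.merge(df, ris_df, on='PatientID')
print(merged)  # the row with PatientID 33333 is dropped; it has no RIS entry
```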
2 changes: 2 additions & 0 deletions modules/suvpar/config.json
@@ -6,6 +6,8 @@
"FeaturesetFile": "featureset1.txt",
"IsStatistics": false,
"Statistics_File": "statistic.csv",
"RIS_File" : "ris.csv",
"IsMergeWithRis" : false,
"IsFinalCSV": true,
"IsAnonymized": true
}
3 changes: 2 additions & 1 deletion modules/suvpar/featureset1.txt
@@ -23,4 +23,5 @@ Manufacturer
ManufacturerModelName
SeriesInstanceUID
[SliceMeasurementDuration]
[Acquisition Duration]
[Acquisition Duration]
ContentTime
3 changes: 3 additions & 0 deletions modules/suvpar/ris.csv
@@ -0,0 +1,3 @@
PatientID,Patient Age at Visit,Encounter,EEMR Accession Number,Exam Room,Encounter Type,Ordered Item,Metrics,Order Timestamp,Requested Start Timestamp,Ancillary Arrival Timestamp,Exam Start Timestamp,Exam Completion Timestamp
1111,44,234232323,23332221,ABC,OV,MRI Brain w/o Contrast,,5/31/20 4:51,5/31/20 4:51,5/31/20 4:51,5/31/20 10:03,5/31/20 10:22
22222,55,23232332323,222222221,DEF,IN,MRI Brain w/+ w/o Contrast,,5/31/20 7:30,5/31/20 7:30,5/31/20 7:22,5/31/20 11:44,5/31/20 12:25
10 changes: 5 additions & 5 deletions modules/workflows/Nextflow_workflows/mega_workflow.nf
@@ -36,7 +36,7 @@ process cold_extraction{
params.workflow==1 || params.workflow==2 || params.workflow==3 || params.workflow==4
script:
"""
python3 $pd/Modules/cold_extraction.py --StorageFolder $params.OutputDirectory/workflow_results/cold_extraction_results --FilePath $params.FilePath --CsvFile $params.CsvFile --NumberOfQueryAttributes $params.NumberOfQueryAttributes --FirstAttr $params.FirstAttr --FirstIndex $params.FirstIndex --SecondAttr $params.SecondAttr --SecondIndex $params.SecondIndex --ThirdAttr $params.ThirdAttr --ThirdIndex $params.ThirdIndex --DateFormat $params.DateFormat --SendEmail $params.SendEmail --YourEmail $params.YourEmail --DCM4CHEBin $params.DCM4CHEBin --SrcAet $params.SrcAet --QueryAet $params.QueryAet --DestAet $params.DestAet --NightlyOnly $params.NightlyOnly --StartHour $params.StartHour --EndHour $params.EndHour --NifflerID $params.NifflerID --MaxNifflerProcesses $params.MaxNifflerProcesses
python3 $pd/src/cold_extraction.py --StorageFolder $params.OutputDirectory/workflow_results/cold_extraction_results --FilePath $params.FilePath --CsvFile $params.CsvFile --NumberOfQueryAttributes $params.NumberOfQueryAttributes --FirstAttr $params.FirstAttr --FirstIndex $params.FirstIndex --SecondAttr $params.SecondAttr --SecondIndex $params.SecondIndex --ThirdAttr $params.ThirdAttr --ThirdIndex $params.ThirdIndex --DateFormat $params.DateFormat --SendEmail $params.SendEmail --YourEmail $params.YourEmail --DCM4CHEBin $params.DCM4CHEBin --SrcAet $params.SrcAet --QueryAet $params.QueryAet --DestAet $params.DestAet --NightlyOnly $params.NightlyOnly --StartHour $params.StartHour --EndHour $params.EndHour --NifflerID $params.NifflerID --MaxNifflerProcesses $params.MaxNifflerProcesses
"""

}
@@ -50,7 +50,7 @@ if(params.workflow==1 || params.workflow==2 || params.workflow==3 || params.work
val depth into png_ext_out
script:
"""
python3 $pd/Modules/ImageExtractor_nextflow.py --DICOMHome $DICOMHome --SplitIntoChunks $params.SplitIntoChunks --PrintImages $params.PrintImages --CommonHeadersOnly $params.CommonHeadersOnly --UseProcesses $params.UseProcesses --FlattenedToLevel $params.FlattenedToLevel --is16Bit $params.is16Bit --SendEmail $params.SendEmail --YourEmail $params.YourEmail --PublicHeadersOnly $params.PublicHeadersOnly --Depth $depth
python3 $pd/src/ImageExtractor_nextflow.py --DICOMHome $params.OutputDirectory/workflow_results/cold_extraction_results --OutputDirectory $params.OutputDirectory/workflow_results/png_extraction_results --SplitIntoChunks $params.SplitIntoChunks --PrintImages $params.PrintImages --CommonHeadersOnly $params.CommonHeadersOnly --UseProcesses $params.UseProcesses --FlattenedToLevel $params.FlattenedToLevel --is16Bit $params.is16Bit --SendEmail $params.SendEmail --YourEmail $params.YourEmail --PublicHeadersOnly $params.PublicHeadersOnly --Depth $depth
"""
}
}
@@ -62,7 +62,7 @@ else{
val depth into png_ext_out
script:
"""
python3 $pd/Modules/ImageExtractor_nextflow.py --DICOMHome $params.OutputDirectory/workflow_results/cold_extraction_results --OutputDirectory $params.OutputDirectory/workflow_results/png_extraction_results --SplitIntoChunks $params.SplitIntoChunks --PrintImages $params.PrintImages --CommonHeadersOnly $params.CommonHeadersOnly --UseProcesses $params.UseProcesses --FlattenedToLevel $params.FlattenedToLevel --is16Bit $params.is16Bit --SendEmail $params.SendEmail --YourEmail $params.YourEmail --PublicHeadersOnly $params.PublicHeadersOnly --Depth $depth
python3 $pd/src/ImageExtractor_nextflow.py --DICOMHome $params.DICOMHome --OutputDirectory $params.OutputDirectory/workflow_results/png_extraction_results --SplitIntoChunks $params.SplitIntoChunks --PrintImages $params.PrintImages --CommonHeadersOnly $params.CommonHeadersOnly --UseProcesses $params.UseProcesses --FlattenedToLevel $params.FlattenedToLevel --is16Bit $params.is16Bit --SendEmail $params.SendEmail --YourEmail $params.YourEmail --PublicHeadersOnly $params.PublicHeadersOnly --Depth $depth
"""
}

@@ -79,7 +79,7 @@ process suvpar{

"""
python3 $pd/Modules/suvpar.py --InputFile $params.OutputDirectory/workflow_results/png_extraction_results/metadata.csv --OutputFile $params.OutputDirectory/workflow_results/suvpar_resuts/output.csv --FeaturesetFile $params.Featureset_File_for_suvpar
python3 $pd/src/suvpar.py --InputFile $params.OutputDirectory/workflow_results/png_extraction_results/metadata.csv --OutputFile $params.OutputDirectory/workflow_results/suvpar_resuts/output.csv --FeaturesetFile $params.Featureset_File_for_suvpar
"""

@@ -102,7 +102,7 @@ process meta_anon{
params.workflow==1 || params.workflow==4 || params.workflow==5 || params.workflow==8
script:
"""
python3 $pd/Modules/metadata_anonymization.py $params.OutputDirectory/workflow_results/png_extraction_results/metadata.csv $params.OutputDirectory/workflow_results/metaAnon_resuts/output.csv
python3 $pd/src/metadata_anonymization.py $params.OutputDirectory/workflow_results/png_extraction_results/metadata.csv $params.OutputDirectory/workflow_results/metaAnon_resuts/output.csv
"""
}

5 changes: 5 additions & 0 deletions modules/workflows/Nextflow_workflows/nextflow.config
@@ -38,4 +38,9 @@ params{
Featureset_File_for_png_extraction=""

Featureset_File_for_suvpar="/path/to/featureset"
ScannerDetails="path/to/scannerDetails.txt"
ScannerFilter=false
IsStatistics=false
IsFinalCSV=false
IsAnonymized=false
}
35 changes: 27 additions & 8 deletions modules/workflows/Nextflow_workflows/src/cold_extraction.py
@@ -3,6 +3,10 @@
import argparse
import logging
import time
import shutil
import schedule
import signal

script_dir = os.path.dirname( __file__ )
module_dir=os.path.join(script_dir,"..","..","..","cold-extraction")
print("script_dir is:",module_dir)
@@ -36,11 +40,6 @@
ap.add_argument("--MaxNifflerProcesses", type=int)
valuesDict = vars(ap.parse_args())






ColdDataRetriever.storage_folder = valuesDict['StorageFolder']
ColdDataRetriever.file_path = valuesDict['FilePath']
ColdDataRetriever.csv_file = valuesDict['CsvFile']
@@ -54,6 +53,8 @@
ColdDataRetriever.date_format = valuesDict['DateFormat']
ColdDataRetriever.email = valuesDict['YourEmail']
ColdDataRetriever.send_email = bool(valuesDict['SendEmail'])
ColdDataRetriever.mod_csv_file = ColdDataRetriever.csv_file[:-4]+'_mod.csv'
shutil.copyfile(ColdDataRetriever.csv_file, ColdDataRetriever.mod_csv_file)

ColdDataRetriever.DCM4CHE_BIN = valuesDict['DCM4CHEBin']
ColdDataRetriever.SRC_AET = valuesDict['SrcAet']
@@ -119,8 +120,26 @@

# record the start time
ColdDataRetriever.t_start = time.time()
ColdDataRetriever.run_cold_extraction()



ColdDataRetriever.read_csv()
# The thread scheduling
schedule.every(1).minutes.do(ColdDataRetriever.run_threaded, ColdDataRetriever.run_retrieval)
schedule.every(10).minutes.do(ColdDataRetriever.run_threaded, ColdDataRetriever.update_pickle)

# Keep running in a loop.
while True:
    try:
        schedule.run_pending()
        time.sleep(1)
    except KeyboardInterrupt:
        ColdDataRetriever.check_kill_process()
        logging.shutdown()
        break


for line in os.popen("ps -ax | grep storescp"):
    fields = line.split()
    pid = fields[0]
    print(pid)
    os.kill(int(pid), signal.SIGKILL)

26 changes: 23 additions & 3 deletions modules/workflows/Nextflow_workflows/src/suvpar.py
@@ -2,6 +2,7 @@
import sys
import argparse
import pandas
import logging
script_dir = os.path.dirname( __file__ )
module_dir=os.path.join(script_dir,"..","..","..","suvpar")
print("script_dir is:",module_dir)
@@ -12,15 +13,34 @@
ap.add_argument("--InputFile")
ap.add_argument("--OutputFile")
ap.add_argument("--FeaturesetFile")
ap.add_argument("--ScannerDetails")
ap.add_argument("--ScannerFilter")
ap.add_argument("--Statistics_File")
ap.add_argument("--IsStatistics")
ap.add_argument("--IsFinalCSV")
ap.add_argument("--IsAnonymized")

config = vars(ap.parse_args())
global output_csv, df
logging.basicConfig(level=logging.INFO)
Suvpar.df = {}
Suvpar.sta = {}
Suvpar.statistics_csv = {}
Suvpar.output_csv = {}


global output_csv, df, device_SN, scanner_filter, statistics_csv, isStatistics, final_csv, isAnonymized
Suvpar.feature_file = config['FeaturesetFile']
Suvpar.filename = config['InputFile']
Suvpar.output_csv = config['OutputFile']

Suvpar.scanner_file = config['ScannerDetails']
Suvpar.scanner_filter = bool(config['ScannerFilter'])
Suvpar.statistics_csv = config['Statistics_File']
Suvpar.isStatistics = bool(config['IsStatistics'])
Suvpar.final_csv = bool(config['IsFinalCSV'])
Suvpar.isAnonymized = bool(config['IsAnonymized'])
Suvpar.text_file = open(Suvpar.feature_file, "r")
Suvpar.feature_list = Suvpar.text_file.read().split('\n')

Suvpar.df = pandas.read_csv(Suvpar.filename, usecols=lambda x: x in Suvpar.feature_list, sep=',')
Suvpar.suvpar()
Suvpar.write()
Suvpar.write()
