Merge pull request #252 from Emory-HITI/dev
Updates to the suvpar module
pradeeban authored Dec 14, 2021
2 parents fabf7b9 + 77091ad commit af7e128
Showing 7 changed files with 100 additions and 63 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -5,7 +5,7 @@ Niffler is a lightweight framework to facilitate executing machine learning pipe
The Niffler framework consists of:
- On-demand and real-time retrieval and processing of DICOM images from the PACS environment configured to accept requests from a deployment of Niffler.
- Acquisition and processing of clinical data from a RIS, to enable real-time analytics (RTA).
- Supportive utility functions such as DICOM → PNG conversion, DICOM → NIfTI conversion, DICOM anonymization, and a workflow module.
- Supportive utility functions such as DICOM → PNG conversion, DICOM → NIfTI conversion, DICOM anonymization, and the workflows module.
- Sample applications of the Niffler modules.

Niffler enables receiving DICOM images in real time as a data stream from the PACS, as well as retrieving specific DICOM data through a series of DICOM C-MOVE queries. The Niffler real-time DICOM receiver extracts PHI-free metadata as the images arrive, stores the metadata in a MongoDB database, and deletes the images nightly. The on-demand extractor reads a user-provided CSV file (consisting of a list of values for PatientID, AccessionNumber, or other DICOM keywords) and performs a series of DICOM C-MOVE requests to retrieve the matching data from the PACS, without manual querying. Niffler also provides additional features such as converting DICOM images into PNG images, and performing additional computations such as computing scanner utilization and finding scanners with misconfigured clocks.
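For the on-demand extractor, the input CSV is at its simplest a header naming the DICOM keyword followed by one value per row. A hypothetical example keyed on AccessionNumber (the values below are made up for illustration):

```
AccessionNumber
A0001234
A0001235
A0001236
```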
2 changes: 1 addition & 1 deletion modules/app-layer/pom.xml
@@ -47,7 +47,7 @@
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-core</artifactId>
- <version>2.15.0</version>
+ <version>2.16.0</version>
</dependency>
</dependencies>

2 changes: 1 addition & 1 deletion modules/suvpar/README.md
@@ -20,5 +20,5 @@ Find the config.json file in the folder and modify accordingly.
First, run the script to trim the file:

````
- $ python3 Strip.py
+ $ python3 Suvpar.py
````
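Based on the keys that Suvpar.py reads from config.json (`FeaturesetFile`, `InputFile`, `OutputFile`), the file would look roughly like the sketch below; the input and output paths are hypothetical placeholders, while featureset.txt is the feature list shipped with the module:

```json
{
  "FeaturesetFile": "featureset.txt",
  "InputFile": "metadata.csv",
  "OutputFile": "suvpar-output.csv"
}
```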
58 changes: 0 additions & 58 deletions modules/suvpar/Strip.py

This file was deleted.

93 changes: 93 additions & 0 deletions modules/suvpar/Suvpar.py
@@ -0,0 +1,93 @@
import pandas
import logging
import json

logging.basicConfig(level=logging.INFO)
df = {}
output_csv = {}
drop = True


def initialize():
    global output_csv, df
    with open('config.json', 'r') as f:
        config = json.load(f)

    feature_file = config['FeaturesetFile']
    filename = config['InputFile']
    output_csv = config['OutputFile']

    text_file = open(feature_file, "r")
    feature_list = text_file.read().split('\n')

    df = pandas.read_csv(filename, usecols=lambda x: x in feature_list, sep=',')


def strip():
    global df
    # Drop entries without an ImageType, AcquisitionTime, SeriesInstanceUID,
    # AcquisitionDate, AccessionNumber, or DeviceSerialNumber entry.
    df.dropna(subset=["ImageType"], inplace=True)
    df.dropna(subset=["AccessionNumber"], inplace=True)
    df.dropna(subset=["SeriesInstanceUID"], inplace=True)
    df.dropna(subset=["AcquisitionTime"], inplace=True)
    df.dropna(subset=["AcquisitionDate"], inplace=True)
    df.dropna(subset=["DeviceSerialNumber"], inplace=True)
    # Consider only the ImageType entries that are ORIGINAL.
    df = df[df['ImageType'].str.contains("ORIGINAL")]
    # Consider only MR. Remove modalities such as PR and SR that are present in the original data.
    df = df[df.Modality == "MR"]
    df['AcquisitionDateTime'] = df['AcquisitionDate'].astype(int).astype(str) + \
        df['AcquisitionTime'].astype(float).astype(str)
    df['AcquisitionDateTime'] = pandas.to_datetime(df['AcquisitionDateTime'], format='%Y%m%d%H%M%S.%f')
    df['AcquisitionDateTime'] = df['AcquisitionDateTime'].dt.strftime('%Y/%m/%d %H:%M:%S.%f')
    df = df.join(
        df.groupby('SeriesInstanceUID')['AcquisitionDateTime'].aggregate(['min', 'max']),
        on='SeriesInstanceUID')
    df.rename(columns={'min': 'SeriesStartTime'}, inplace=True)
    df.rename(columns={'max': 'SeriesEndTime'}, inplace=True)
    df['SeriesStartTime'] = pandas.to_datetime(df['SeriesStartTime'])
    df['SeriesEndTime'] = pandas.to_datetime(df['SeriesEndTime'])
    df['SeriesDurationInMins'] = (df.SeriesEndTime - df.SeriesStartTime).dt.seconds / 60.0

    if drop:
        # Keep only one instance per series. 322,866 rows drops to 3,656 in a tested sample, by this step.
        df = df.drop_duplicates('SeriesInstanceUID')
        df = df.drop(columns=['AcquisitionDate'])
        df = df.drop(columns=['AcquisitionTime'])

    df = df.join(df.groupby('AccessionNumber')['AcquisitionDateTime'].aggregate(['min', 'max']), on='AccessionNumber')
    df.rename(columns={'min': 'StudyStartTime'}, inplace=True)
    df.rename(columns={'max': 'StudyEndTime'}, inplace=True)
    df['StudyStartTime'] = pandas.to_datetime(df['StudyStartTime'])
    df['StudyEndTime'] = pandas.to_datetime(df['StudyEndTime'])
    df['StudyDurationInMins'] = (df.StudyEndTime - df.StudyStartTime).dt.seconds / 60.0

    df = df.join(df.groupby('PatientID')['AcquisitionDateTime'].aggregate(['min', 'max']), on='PatientID')
    df.rename(columns={'min': 'PatientStartTime'}, inplace=True)
    df.rename(columns={'max': 'PatientEndTime'}, inplace=True)
    df['PatientStartTime'] = pandas.to_datetime(df['PatientStartTime'])
    df['PatientEndTime'] = pandas.to_datetime(df['PatientEndTime'])
    df['PatientDurationInMins'] = (df.PatientEndTime - df.PatientStartTime).dt.seconds / 60.0

    df = df.join(df.groupby('DeviceSerialNumber')['AcquisitionDateTime'].aggregate(['min', 'max']),
                 on='DeviceSerialNumber')
    # Estimating the last scan as the scanner off.
    df.rename(columns={'min': 'ScannerOn'}, inplace=True)
    df.rename(columns={'max': 'ScannerOff'}, inplace=True)
    df['ScannerOn'] = pandas.to_datetime(df['ScannerOn'])
    df['ScannerOff'] = pandas.to_datetime(df['ScannerOff'])
    df['ScannerTotalOnTimeInMins'] = (df.ScannerOff - df.ScannerOn).dt.seconds / 60.0

    # Sort by "DeviceSerialNumber" and "SeriesStartTime".
    df = df.sort_values(["DeviceSerialNumber", "SeriesStartTime"])


def write():
    df.to_csv(output_csv)


if __name__ == "__main__":
    initialize()
    strip()
    write()
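The per-series timing logic above can be sketched in isolation. A minimal example of the groupby/min-max/join pattern that Suvpar.py applies, using made-up UIDs and timestamps rather than real DICOM metadata:

```python
import pandas as pd

# Toy frame: two series, with one and two acquisitions respectively.
df = pd.DataFrame({
    "SeriesInstanceUID": ["1.2.3", "1.2.3", "1.2.4"],
    "AcquisitionDateTime": pd.to_datetime([
        "2021/12/01 08:00:00",
        "2021/12/01 08:30:00",
        "2021/12/01 09:00:00",
    ]),
})

# Join the per-series min/max acquisition times back onto each row,
# mirroring the pattern in strip().
df = df.join(
    df.groupby("SeriesInstanceUID")["AcquisitionDateTime"].aggregate(["min", "max"]),
    on="SeriesInstanceUID",
)
df = df.rename(columns={"min": "SeriesStartTime", "max": "SeriesEndTime"})
df["SeriesDurationInMins"] = (df.SeriesEndTime - df.SeriesStartTime).dt.seconds / 60.0

# Collapse to one row per series, as the drop=True branch does.
summary = df.drop_duplicates("SeriesInstanceUID")[["SeriesInstanceUID", "SeriesDurationInMins"]]
print(summary)
```

Note that `.dt.seconds` returns only the seconds component of the timedelta; it agrees with `total_seconds()` only for spans under a day, which matters for the multi-day scanner on-time estimate.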
3 changes: 2 additions & 1 deletion modules/suvpar/featureset.txt
@@ -44,4 +44,5 @@ FlipAngle
SAR
Modality
Manufacturer
-ManufacturerModelName
+ManufacturerModelName
+SeriesInstanceUID
3 changes: 2 additions & 1 deletion modules/suvpar/featureset1.txt
@@ -20,4 +20,5 @@ InstanceNumber
AcquisitionDuration
Modality
Manufacturer
-ManufacturerModelName
+ManufacturerModelName
+SeriesInstanceUID
