Merge pull request #84 from chorus-ai/format-dicom
DICOM Fidelity Check failure
briangow authored Jul 25, 2024
2 parents 93ce7aa + 74778e8 commit c1c5c93
Showing 6 changed files with 463 additions and 257 deletions.
4 changes: 3 additions & 1 deletion BENCHMARK.md
@@ -9,7 +9,7 @@ The benchmarking script includes a simple set of metrics to evaluate the perform
The [waveform_benchmark.py](./waveform_benchmark.py) script is the entrypoint for running benchmarks. This script calls functions in the `waveform_benchmark` package. The syntax for running waveform_benchmark.py from the command line is:

```
./waveform_benchmark.py -r <PATH_TO_RECORD> -f <PATH_TO_BENCHMARK_CLASS> -p <PHYSIONET_DATABASE_PATH> [-m]
./waveform_benchmark.py -r <PATH_TO_RECORD> -f <PATH_TO_BENCHMARK_CLASS> -p <PHYSIONET_DATABASE_PATH> [--test_only] [-m]
```

The `-p` argument can be used to pull a file directly from a PhysioNet database but isn't needed when running on a local file. For example, to run the `WFDBFormat516` benchmark on local record `data/waveforms/mimic_iv/waves/p100/p10079700/85594648/85594648`:
@@ -18,6 +18,8 @@ The `-p` argument can be used to pull a file directly from a PhysioNet database
./waveform_benchmark.py -r ./data/waveforms/mimic_iv/waves/p100/p10079700/85594648/85594648 -f waveform_benchmark.formats.wfdb.WFDBFormat516
```

The `--test_only` flag turns off the read performance benchmarking and thus runs only the fidelity tests.
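
For example, to run only the fidelity tests against the local record from the previous example:

```
./waveform_benchmark.py -r ./data/waveforms/mimic_iv/waves/p100/p10079700/85594648/85594648 -f waveform_benchmark.formats.wfdb.WFDBFormat516 --test_only
```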

An example output is provided below:

```
5 changes: 5 additions & 0 deletions waveform_benchmark/__main__.py
@@ -82,6 +82,9 @@ def main():
ap.add_argument('--waveform_suite_summary_file', '-w',
default='waveform_suite_benchmark_summary.csv',
help='Save a CSV summary of the waveform suite run to this path/file')
ap.add_argument('--test_only',
default=False, action='store_true',
help='Run only the tests, do not run the benchmarks')
ap.add_argument('--memory_profiling', '-m',
default=False, type=bool, action=argparse.BooleanOptionalAction,
help='Run memory profiling on the benchmarking process')
@@ -117,6 +120,7 @@ def main():
waveform_list=waveform_list,
test_list=test_list,
result_list=result_list,
test_only = opts.test_only,
mem_profile = opts.memory_profile)

save_summary(format_list, waveform_list, test_list, result_list, opts.waveform_suite_summary_file)
@@ -126,6 +130,7 @@ def main():
run_benchmarks(input_record=opts.input_record,
format_class=opts.format_class,
pn_dir=opts.physionet_directory,
test_only = opts.test_only,
mem_profile = opts.memory_profiling)

# Close the log file after the run is complete
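
As context for the new flag, here is a minimal, self-contained sketch of the `store_true` wiring shown above (only the argparse lines from this diff are assumed; `run_benchmarks` itself lives in `waveform_benchmark.benchmark`):

```
import argparse

ap = argparse.ArgumentParser()
ap.add_argument('--test_only',
                default=False, action='store_true',
                help='Run only the tests, do not run the benchmarks')

# store_true flags default to False and flip to True when the flag is present.
print(ap.parse_args([]).test_only)               # False
print(ap.parse_args(['--test_only']).test_only)  # True
```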
187 changes: 94 additions & 93 deletions waveform_benchmark/benchmark.py
@@ -46,13 +46,14 @@ def _run_read_test(fmt, path, total_length, all_channels, block_length, block_co
def _run_read_test_1channel(fmt, path, total_length, all_channels, block_length, block_count,
test_min_dur = 10, test_min_iter = 3, mem_profile = False):
counters = []
r = random.Random(12345)
for i in repeat_test(test_min_dur, test_min_iter):
r = random.Random(12345)
with PerformanceCounter(mem_profile = mem_profile) as pc:
for j in range(block_count):
t0 = r.random() * (total_length - block_length)
t1 = t0 + block_length
fmt().read_waveforms(path, t0, t1, all_channels)
c = r.choice(all_channels)
fmt().read_waveforms(path, t0, t1, [c])
counters.append(pc)
return counters
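
Two behaviors change in this helper: the RNG is now re-seeded inside the trial loop, so every trial draws the same sequence of offsets and channel picks, and each block reads one randomly chosen channel instead of all channels, so the one-channel benchmark actually measures single-channel reads. A minimal sketch of the re-seeding effect (illustrative only; `repeat_test` and `PerformanceCounter` are repository helpers not shown in this diff, and the channel names below are hypothetical):

```
import random

# Re-seeding inside the loop makes every trial deterministic, so the
# medians reported later compare identical workloads across trials.
for trial in range(3):
    r = random.Random(12345)                   # same seed each trial
    offsets = [round(r.random(), 3) for _ in range(3)]
    channel = r.choice(['ECG', 'PPG', 'ABP'])  # hypothetical channel names
    print(trial, offsets, channel)             # same offsets and channel every trial
```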

@@ -87,7 +88,7 @@ def compute_snr(reference_signal, output_signal):


def run_benchmarks(input_record, format_class, pn_dir=None, format_list=None, waveform_list=None, test_list=None,
result_list=None, mem_profile = False):
result_list=None, mem_profile = False, test_only = False):

# Load the class we will be testing
module_name, class_name = format_class.rsplit('.', 1)
@@ -249,7 +250,7 @@ def run_benchmarks(input_record, format_class, pn_dir=None, format_list=None, wa
# use numpy's isclose to determine floating point equality
isgood = np.isclose(filedata_nonan, data_nonan, atol=0.5/chunk['gain'])
numgood = np.sum(isgood)
fpeq_rel = numgood/len(data_nonan)
fpeq_rel = numgood/len(data_nonan) if len(data_nonan) != 0 else np.nan

# compute SNR to quantify signal fidelity
snr = compute_snr(data_nonan, filedata_nonan)
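
A small, self-contained illustration of this fidelity check with hypothetical values; the tolerance of half a quantization step mirrors the `atol=0.5/chunk['gain']` above, and the empty-input guard mirrors the change to `fpeq_rel`:

```
import numpy as np

gain = 200.0
reference = np.array([0.005, 0.010, 0.015])  # hypothetical reference samples
decoded = np.array([0.005, 0.013, 0.015])    # middle sample off by more than one step

isgood = np.isclose(reference, decoded, atol=0.5 / gain)
numgood = np.sum(isgood)
fpeq_rel = numgood / len(reference) if len(reference) != 0 else np.nan
print(fpeq_rel)  # 0.666...: two of three samples are within tolerance
```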
@@ -265,97 +266,97 @@ def run_benchmarks(input_record, format_class, pn_dir=None, format_list=None, wa
print(filedata_nonan[~isgood][:10])
print(f"(Gain: {chunk['gain']})")
# print('_' * 64)
print('_' * 64)
print('Read performance (median of N trials):')
if (mem_profile):
print(' #seek #read KiB CPU(s) Wall(s) Mem(MB)(used/maxrss/malloced) [N]')
else:
print(' #seek #read KiB CPU(s) Wall(s) [N]')

for block_length, block_count in TEST_BLOCK_LENGTHS:

# time1 = time.time()
counters = []
if (mem_profile):
mem_usage, counters = memory_usage((_run_read_test, (fmt, path, total_length, all_channels, block_length, block_count), {'test_min_dur': TEST_MIN_DURATION, 'test_min_iter': TEST_MIN_ITERATIONS, 'mem_profile': mem_profile}), include_children = True, max_usage = True, retval = True)
else:
counters = _run_read_test(fmt, path, total_length, all_channels, block_length, block_count, test_min_dur = TEST_MIN_DURATION, test_min_iter = TEST_MIN_ITERATIONS, mem_profile = mem_profile)
# walltime = time.time() - time1

if (mem_profile):
print('%6.0f %6.0f %8.0f %8.4f %8.4f %8.4f/%8.4f/%8.4f %6s read %d x %.0fs, all channels'
% (median_attr(counters, 'n_seek_calls'),
median_attr(counters, 'n_read_calls'),
median_attr(counters, 'n_bytes_read') / 1024,
median_attr(counters, 'cpu_seconds'),
median_attr(counters, 'walltime'),
# walltime / len(counters),
mem_usage,
median_attr(counters, 'max_rss'),
median_attr(counters, 'malloced'),
'[%d]' % len(counters),
block_count,
block_length))
else:
print('%6.0f %6.0f %8.0f %8.4f %8.4f %6s read %d x %.0fs, all channels'
% (median_attr(counters, 'n_seek_calls'),
median_attr(counters, 'n_read_calls'),
median_attr(counters, 'n_bytes_read') / 1024,
median_attr(counters, 'cpu_seconds'),
median_attr(counters, 'walltime'),
'[%d]' % len(counters),
block_count,
block_length))
if format_list is not None:
# Append read time result
format_list, waveform_list, test_list, result_list = append_result(format_class, input_record,
f'{block_count}_all',
median_attr(counters, 'cpu_seconds'),
format_list,
waveform_list, test_list,
result_list)

for block_length, block_count in TEST_BLOCK_LENGTHS:
# time1 = time.time()
counters = []
if not test_only:
print('_' * 64)
print('Read performance (median of N trials):')
if (mem_profile):
mem_usage, counters = memory_usage((_run_read_test_1channel, (fmt, path, total_length, all_channels, block_length, block_count), {'test_min_dur': TEST_MIN_DURATION, 'test_min_iter': TEST_MIN_ITERATIONS, 'mem_profile': mem_profile}), include_children = True, max_usage = True, retval = True)
print(' #seek #read KiB CPU(s) Wall(s) Mem(MB)(used/maxrss/malloced) [N]')
else:
counters = _run_read_test_1channel(fmt, path, total_length, all_channels, block_length, block_count, test_min_dur = TEST_MIN_DURATION, test_min_iter = TEST_MIN_ITERATIONS, mem_profile = mem_profile)
# walltime = time.time() - time1

if (mem_profile):
print('%6.0f %6.0f %8.0f %8.4f %8.4f %8.4f/%8.4f/%8.4f %6s read %d x %.0fs, one channel'
% (median_attr(counters, 'n_seek_calls'),
median_attr(counters, 'n_read_calls'),
median_attr(counters, 'n_bytes_read') / 1024,
median_attr(counters, 'cpu_seconds'),
# walltime / len(counters),
median_attr(counters, 'walltime'),
mem_usage,
median_attr(counters, 'max_rss'),
median_attr(counters, 'malloced'),
'[%d]' % len(counters),
block_count,
block_length))
else:
print('%6.0f %6.0f %8.0f %8.4f %8.4f %6s read %d x %.0fs, one channel'
% (median_attr(counters, 'n_seek_calls'),
median_attr(counters, 'n_read_calls'),
median_attr(counters, 'n_bytes_read') / 1024,
median_attr(counters, 'cpu_seconds'),
median_attr(counters, 'walltime'),
# walltime / len(counters),
'[%d]' % len(counters),
block_count,
block_length))
if format_list:
format_list, waveform_list, test_list, result_list = append_result(format_class, input_record,
f'{block_count}_one',
median_attr(counters, 'cpu_seconds'),
format_list,
waveform_list, test_list,
result_list)
print(' #seek #read KiB CPU(s) Wall(s) [N]')

for block_length, block_count in TEST_BLOCK_LENGTHS:

# time1 = time.time()
counters = []
if (mem_profile):
mem_usage, counters = memory_usage((_run_read_test, (fmt, path, total_length, all_channels, block_length, block_count), {'test_min_dur': TEST_MIN_DURATION, 'test_min_iter': TEST_MIN_ITERATIONS, 'mem_profile': mem_profile}), include_children = True, max_usage = True, retval = True)
else:
counters = _run_read_test(fmt, path, total_length, all_channels, block_length, block_count, test_min_dur = TEST_MIN_DURATION, test_min_iter = TEST_MIN_ITERATIONS, mem_profile = mem_profile)
# walltime = time.time() - time1

if (mem_profile):
print('%6.0f %6.0f %8.0f %8.4f %8.4f %8.4f/%8.4f/%8.4f %6s read %d x %.0fs, all channels'
% (median_attr(counters, 'n_seek_calls'),
median_attr(counters, 'n_read_calls'),
median_attr(counters, 'n_bytes_read') / 1024,
median_attr(counters, 'cpu_seconds'),
median_attr(counters, 'walltime'),
# walltime / len(counters),
mem_usage,
median_attr(counters, 'max_rss'),
median_attr(counters, 'malloced'),
'[%d]' % len(counters),
block_count,
block_length))
else:
print('%6.0f %6.0f %8.0f %8.4f %8.4f %6s read %d x %.0fs, all channels'
% (median_attr(counters, 'n_seek_calls'),
median_attr(counters, 'n_read_calls'),
median_attr(counters, 'n_bytes_read') / 1024,
median_attr(counters, 'cpu_seconds'),
median_attr(counters, 'walltime'),
'[%d]' % len(counters),
block_count,
block_length))
if format_list is not None:
# Append read time result
format_list, waveform_list, test_list, result_list = append_result(format_class, input_record,
f'{block_count}_all',
median_attr(counters, 'cpu_seconds'),
format_list,
waveform_list, test_list,
result_list)

for block_length, block_count in TEST_BLOCK_LENGTHS:
counters = []
if (mem_profile):
mem_usage, counters = memory_usage((_run_read_test_1channel, (fmt, path, total_length, all_channels, block_length, block_count), {'test_min_dur': TEST_MIN_DURATION, 'test_min_iter': TEST_MIN_ITERATIONS, 'mem_profile': mem_profile}), include_children = True, max_usage = True, retval = True)
else:
counters = _run_read_test_1channel(fmt, path, total_length, all_channels, block_length, block_count, test_min_dur = TEST_MIN_DURATION, test_min_iter = TEST_MIN_ITERATIONS, mem_profile = mem_profile)
# walltime = time.time() - time1

if (mem_profile):
print('%6.0f %6.0f %8.0f %8.4f %8.4f %8.4f/%8.4f/%8.4f %6s read %d x %.0fs, one channel'
% (median_attr(counters, 'n_seek_calls'),
median_attr(counters, 'n_read_calls'),
median_attr(counters, 'n_bytes_read') / 1024,
median_attr(counters, 'cpu_seconds'),
median_attr(counters, 'walltime'),
# walltime / len(counters),
mem_usage,
median_attr(counters, 'max_rss'),
median_attr(counters, 'malloced'),
'[%d]' % len(counters),
block_count,
block_length))
else:
print('%6.0f %6.0f %8.0f %8.4f %8.4f %6s read %d x %.0fs, one channel'
% (median_attr(counters, 'n_seek_calls'),
median_attr(counters, 'n_read_calls'),
median_attr(counters, 'n_bytes_read') / 1024,
median_attr(counters, 'cpu_seconds'),
median_attr(counters, 'walltime'),
'[%d]' % len(counters),
block_count,
block_length))

if format_list:
format_list, waveform_list, test_list, result_list = append_result(format_class, input_record,
f'{block_count}_one',
median_attr(counters, 'cpu_seconds'),
format_list,
waveform_list, test_list,
result_list)

print('_' * 64)
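
The fidelity section above relies on `compute_snr`, whose body falls outside the hunks shown here. A minimal sketch assuming the conventional decibel definition, with the reconstruction error as the noise term (an assumption, not the repository's confirmed implementation):

```
import numpy as np

def compute_snr_sketch(reference_signal, output_signal):
    # Assumed definition: SNR(dB) = 10 * log10(signal power / error power).
    noise = reference_signal - output_signal
    noise_power = np.mean(noise ** 2)
    if noise_power == 0:
        return float('inf')  # bit-exact round-trip
    signal_power = np.mean(reference_signal ** 2)
    return float(10 * np.log10(signal_power / noise_power))
```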

24 changes: 16 additions & 8 deletions waveform_benchmark/formats/dcm_utils/dcm_waveform_reader.py
@@ -175,21 +175,29 @@ def get_multiplex_array(fileobj : BinaryIO,
chseq = cast(list["Dataset"], channel_seqs)

# Apply correction factor (if possible)
padd_loc = (arr == padding_value)
arr = arr.astype("float")
for ch_idx, ch in enumerate(chseq):
baseline = float(ch.ChannelBaseline)
sensitivity = float(ch.ChannelSensitivity)
correction = float(ch.ChannelSensitivityCorrectionFactor)

# print("baseline " + str(baseline) + " sensitivity " + str(sensitivity) + " correction " + str(correction))

adjustment = sensitivity * correction
if (adjustment != 1.0) and (baseline != 0.0):
arr[ch_idx, ...] = np.where(arr[ch_idx, ...] == padding_value, np.nan, arr[ch_idx, ...] * adjustment + baseline)
elif (adjustment != 1.0):
arr[ch_idx, ...] = np.where(arr[ch_idx, ...] == padding_value, np.nan, arr[ch_idx, ...] * adjustment)
elif (baseline != 0.0):
arr[ch_idx, ...] = np.where(arr[ch_idx, ...] == padding_value, np.nan, arr[ch_idx, ...] + baseline)
base = baseline
# print(" reading ", ch_idx, sensitivity, baseline, correction, adjustment, base)
if (adjustment != 1.0):
if (base != 0.0):
arr[ch_idx, ...] = np.where(padd_loc[ch_idx, ...], np.nan, arr[ch_idx, ...] * adjustment - base)
else:
arr[ch_idx, ...] = np.where(padd_loc[ch_idx, ...], np.nan, arr[ch_idx, ...] * adjustment)
else:
arr[ch_idx, ...] = np.where(arr[ch_idx, ...] == padding_value, np.nan, arr[ch_idx, ...])

if (base != 0.0):
arr[ch_idx, ...] = np.where(padd_loc[ch_idx, ...], np.nan, arr[ch_idx, ...] - base)
else:
arr[ch_idx, ...] = np.where(padd_loc[ch_idx, ...], np.nan, arr[ch_idx, ...])
# print(ch.ChannelSourceSequence[0].CodeMeaning + " sensitivity " + str(sensitivity) + " gain = " + str(1.0 / sensitivity) + " baseline = " + str(baseline) + " correction = " + str(correction))

return cast("np.ndarray", arr)
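
The corrected logic records padding positions on the raw integer array before the float conversion and any arithmetic, so rescaled samples can no longer collide with the padding value, and the baseline is now subtracted during rescaling. A self-contained illustration with hypothetical values:

```
import numpy as np

padding_value = -32768
arr = np.array([[100, -32768, 300]], dtype=np.int16)  # one channel, hypothetical samples

# Mask padding on the raw integers, before rescaling can collide with it.
padd_loc = (arr == padding_value)
arr = arr.astype("float")

adjustment = 0.5  # sensitivity * correction, hypothetical
base = 10.0       # channel baseline, hypothetical

arr[0, ...] = np.where(padd_loc[0, ...], np.nan,
                       arr[0, ...] * adjustment - base)
print(arr)  # [[ 40.  nan 140.]]
```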
