-
Notifications
You must be signed in to change notification settings - Fork 47
/
Copy pathcharacterize_data.py
468 lines (437 loc) · 20.3 KB
/
characterize_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
import SimpleITK as sitk
import pandas as pd
import numpy as np
import os
import sys
import shutil
import subprocess
import platform
# We use the multiprocess package instead of the official
# multiprocessing as it currently has several issues as discussed
# on the software carpentry page: https://hpc-carpentry.github.io/hpc-python/06-parallel/
import multiprocess as mp
from functools import partial
import argparse
import hashlib
import tempfile
# Maximal number of parallel processes we run.
MAX_PROCESSES = 15
"""
This script inspects/charachterizes images in a given directory structure. It
recuresivly traverses the directories and either inspects the files one by one
or if in DICOM series inspection mode, inspects the data on a per series basis
(all 2D series files combined into a single 3D image).
To run the script one needs to specify:
1. Root of the data directory.
2. Output file name.
3. The analysis type to perform per_file or per_series. The latter indicates
we are only interested in DICOM files. When run using per_file empty lines
in the results file are due to:
a. The file is not an image or is a corrupt image file.
b. SimpleITK was unable to read the image file (contact us with an example).
4. Optional SimpleITK imageIO to use. The default value is
the empty string, indicating that all file types should be read.
To see the set of ImageIO types supported by your version of SimpleITK,
call ImageFileReader::GetRegisteredImageIOs() or simply print an
ImageFileReader object.
5. Optional exteranl applications to run. Their return value (zero or
non zero) is used to log success or failure. A nice example is the
dciodvfy program from David Clunie (https://www.dclunie.com/dicom3tools.html)
which validates compliance with the DICOM standard.
6. When the external applications are provided corrosponding column headings
are also required. These are used in the output csv file.
7. Optional metadata keys. These are image specific keys such as DICOM tags
or other metadata tags that may be found in the image. The content of the
tags is written to the result file.
8. When the metadata tags are provided corrosponding column headings
are also required. These are used in the output csv file.
Examples:
Run a generic file analysis:
python characterize_data.py data/ output/generic_image_data_report.csv per_file \
--imageIO "" --external_applications ./dciodvfy --external_applications_headings "DICOM Compliant" \
--metadata_keys "0008|0060" "0018|5101" --metadata_keys_headings "modality" "radiographic view"
Run a DICOM series based analysis:
python characterize_data.py data/ output/DICOM_image_data_report.csv per_series \
--metadata_keys "0008|0060" "0018|5101" --metadata_keys_headings "modality" "radiographic view"
"""
def inspect_image(sitk_image, image_info, current_index, meta_data_keys=[]):
"""
Inspect a SimpleITK image, return a list of parameters characterizing the image.
Parameters
----------
sitk_image (SimpleITK.Image): Input image for inspection.
image_info (list): Image information is written to this list, starting at current_index.
[,,,MD5 intensity hash,
image size, image spacing, image origin, axis direction,
pixel type, min intensity, max intensity,
meta data_1...meta_data_n,,,]
current_index (int): Starting index into the image_info list.
meta_data_keys(list(str)): The image's meta-data dictionary keys whose value we want to
inspect.
Returns
-------
index to the next empty entry in the image_info list.
The image_info list is filled with the following values:
MD5 intensity hash - Enable identification of duplicate images in terms of intensity.
This is different from SimpleITK image equality where the
same intensities with different image spacing/origin/direction cosine
are considered different images as they occupy a different spatial
region.
image size - number of pixels in each dimension.
pixel type - type of pixels (scalar - gray, vector - gray or color).
min/max intensity - if a scalar image, min and max values.
meta data_i - value of image's metadata dictionary for given key (e.g. .
"""
image_info[current_index] = hashlib.md5(
sitk.GetArrayViewFromImage(sitk_image)
).hexdigest()
current_index = current_index + 1
image_info[current_index] = sitk_image.GetSize()
current_index = current_index + 1
image_info[current_index] = sitk_image.GetSpacing()
current_index = current_index + 1
image_info[current_index] = sitk_image.GetOrigin()
current_index = current_index + 1
image_info[current_index] = sitk_image.GetDirection()
current_index = current_index + 1
if (
sitk_image.GetNumberOfComponentsPerPixel() == 1
): # greyscale image, get the min/max pixel values
image_info[current_index] = sitk_image.GetPixelIDTypeAsString() + " gray"
current_index = current_index + 1
mmfilter = sitk.MinimumMaximumImageFilter()
mmfilter.Execute(sitk_image)
image_info[current_index] = mmfilter.GetMinimum()
current_index = current_index + 1
image_info[current_index] = mmfilter.GetMaximum()
current_index = current_index + 1
else: # either a color image or a greyscale image masquerading as a color one
pixel_type = sitk_image.GetPixelIDTypeAsString()
channels = [
sitk.GetArrayFromImage(sitk.VectorIndexSelectionCast(sitk_image, i))
for i in range(sitk_image.GetNumberOfComponentsPerPixel())
]
if np.array_equal(channels[0], channels[1]) and np.array_equal(
channels[0], channels[2]
):
pixel_type = pixel_type + " {0} channels gray".format(
sitk_image.GetNumberOfComponentsPerPixel()
)
else:
pixel_type = pixel_type + " {0} channels color".format(
sitk_image.GetNumberOfComponentsPerPixel()
)
image_info[current_index] = pixel_type
current_index = current_index + 3
img_keys = sitk_image.GetMetaDataKeys()
for k in meta_data_keys:
if k in img_keys:
image_info[current_index] = sitk_image.GetMetaData(k)
current_index = current_index + 1
return current_index
def inspect_single_file(file_name, imageIO="", meta_data_keys=[], external_programs=[]):
"""
Inspect a file using the specified imageIO, returning a list with the relevant information.
Parameters
----------
file_name (str): Image file name.
imageIO (str): Name of image IO to use. To see the list of registered image IOs use the
ImageFileReader::GetRegisteredImageIOs() or print an ImageFileReader.
The empty string indicates to read all file formats supported by SimpleITK.
meta_data_keys(list(str)): The image's meta-data dictionary keys whose value we want to
inspect.
external_programs(list(str)): A list of programs we will run with the file_name as input
the return value 'succeeded' or 'failed' is recorded. This is useful
for example if you need to validate conformance to a standard
such as DICOM.
Returns
-------
list with the following entries: [file name, MD5 intensity hash,
image size, image spacing, image origin, axis direction,
pixel type, min intensity, max intensity,
meta data_1...meta_data_n,
external_program_res_1...external_program_res_m]
If the given file is not readable by SimpleITK, the only meaningful entry in the list
will be the file name (all other values will be either None or NaN).
"""
file_info = [None] * (9 + len(meta_data_keys) + len(external_programs))
file_info[0] = file_name
current_index = 1
try:
reader = sitk.ImageFileReader()
reader.SetImageIO(imageIO)
reader.SetFileName(file_name)
img = reader.Execute()
current_index = inspect_image(img, file_info, current_index, meta_data_keys)
for p in external_programs:
try:
# run the external programs, check the return value, and capture all output so it
# doesn't appear on screen. The CalledProcessError exception is raised if the
# external program fails (returns non zero value).
subprocess.run([p, file_name], check=True, capture_output=True)
file_info[current_index] = "succeeded"
except:
file_info[current_index] = "failed"
current_index = current_index + 1
except:
pass
return file_info
def inspect_files(
root_dir,
imageIO="",
meta_data_keys=[],
external_programs=[],
additional_column_names=[],
):
"""
Iterate over a directory structure and return a pandas dataframe with the relevant information for the
image files. This also includes non image files. The resulting dataframe will only include the file name
if that file wasn't successfuly read by SimpleITK. The two reasons for failure are: (1) the user specified
imageIO isn't compatible with the file format (user is only interested in reading jpg and the file
format is mha) or (2) the file could not be read by the SimpleITK IO (corrupt file or unexpected limitation of
SimpleITK).
Parameters
----------
root_dir (str): Path to the root of the data directory. Traverse the directory structure
and inspect every file (also report non image files, in which
case the only valid entry will be the file name).
imageIO (str): Name of image IO to use. To see the list of registered image IOs use the
ImageFileReader::GetRegisteredImageIOs() or print an ImageFileReader.
The empty string indicates to read all file formats supported by SimpleITK.
meta_data_keys(list(str)): The image's meta-data dictionary keys whose value we want to
inspect.
external_programs(list(str)): A list of programs we will run with the file_name as input
the return value 'succeeded' or 'failed' is recorded. This
is useful for example if you need to validate conformance
to a standard such as DICOM.
additional_column_names (list(str)): Column names corrosponding to the contents of the
meta_data_keys and external_programs lists.
Returns
-------
pandas DataFrame: Each row in the data frame corresponds to a single file.
"""
if len(meta_data_keys) + len(external_programs) != len(additional_column_names):
raise ValueError("Number of additional column names does not match expected.")
column_names = [
"file name",
"MD5 intensity hash",
"image size",
"image spacing",
"image origin",
"axis direction",
"pixel type",
"min intensity",
"max intensity",
] + additional_column_names
all_file_names = []
for dir_name, subdir_names, file_names in os.walk(root_dir):
all_file_names += [
os.path.join(os.path.abspath(dir_name), fname) for fname in file_names
]
# Get list of lists describing the results and then combine into a dataframe, faster
# than appending to the dataframe one by one. Use parallel processing to speed things up.
if platform.system() == "Windows":
res = map(
partial(
inspect_single_file,
imageIO=imageIO,
meta_data_keys=meta_data_keys,
external_programs=external_programs,
),
all_file_names,
)
else:
with mp.Pool(processes=MAX_PROCESSES) as pool:
res = pool.map(
partial(
inspect_single_file,
imageIO=imageIO,
meta_data_keys=meta_data_keys,
external_programs=external_programs,
),
all_file_names,
)
return pd.DataFrame(res, columns=column_names)
def inspect_single_series(series_data, meta_data_keys=[]):
"""
Inspect a single DICOM series (DICOM heirarchy of patient-study-series-image).
This can be a single file, or multiple files such as a CT or
MR volume.
Parameters
----------
series_data (two entry tuple): First entry is study:series, second entry is the list of
files comprising this series.
meta_data_keys(list(str)): The image's meta-data dictionary keys whose value we want to
inspect.
Returns
-------
list with the following entries: [study:series, MD5 intensity hash,
image size, image spacing, image origin, axis direction,
pixel type, min intensity, max intensity,
meta data_1...meta_data_n]
"""
series_info = [None] * (9 + len(meta_data_keys))
series_info[0] = series_data[1]
current_index = 1
try:
reader = sitk.ImageSeriesReader()
reader.MetaDataDictionaryArrayUpdateOn()
reader.LoadPrivateTagsOn()
_, sid = series_data[0].split(":")
file_names = series_data[1]
# As the files comprising a series with multiple files can reside in
# separate directories and SimpleITK expects them to be in a single directory
# we use a tempdir and symbolic links to enable SimpleITK to read the series as
# a single image. Additionally the files are renamed as they may have resided in
# separate directories with the same file name. Finally, unfortunately on windows
# we copy the files to the tempdir as the os.symlink documentation says that
# "On newer versions of Windows 10, unprivileged accounts can create symlinks
# if Developer Mode is enabled. When Developer Mode is not available/enabled,
# the SeCreateSymbolicLinkPrivilege privilege is required, or the process must be
# run as an administrator."
with tempfile.TemporaryDirectory() as tmpdirname:
if platform.system() == "Windows":
for i, fname in enumerate(file_names):
shutil.copy(
os.path.abspath(fname), os.path.join(tmpdirname, str(i))
)
else:
for i, fname in enumerate(file_names):
os.symlink(os.path.abspath(fname), os.path.join(tmpdirname, str(i)))
reader.SetFileNames(
sitk.ImageSeriesReader_GetGDCMSeriesFileNames(tmpdirname, sid)
)
img = reader.Execute()
for k in meta_data_keys:
if reader.HasMetaDataKey(0, k):
img.SetMetaData(k, reader.GetMetaData(0, k))
inspect_image(img, series_info, current_index, meta_data_keys)
except:
pass
return series_info
def inspect_series(root_dir, meta_data_keys=[], additional_column_names=[]):
"""
Inspect all series found in the directory structure. A series does not have to
be in a single directory (the files are located in the subtree and combined
into a single image).
Parameters
----------
root_dir (str): Path to the root of the data directory. Traverse the directory structure
and inspect every series. If the series is comprised of multiple image files
they do not have to be in the same directory. The only expectation is that all
images from the series are under the root_dir.
meta_data_keys(list(str)): The series meta-data dictionary keys whose value we want to
inspect.
additional_column_names (list(str)): Column names corrosponding to the contents of the
meta_data_keys list.
Returns
-------
pandas DataFrame: Each row in the data frame corresponds to a single file.
"""
if len(meta_data_keys) != len(additional_column_names):
raise ValueError("Number of additional column names does not match expected.")
column_names = [
"files",
"MD5 intensity hash",
"image size",
"image spacing",
"image origin",
"axis direction",
"pixel type",
"min intensity",
"max intensity",
] + additional_column_names
all_series_files = {}
reader = sitk.ImageFileReader()
# collect the file names of all series into a dictionary with the key being
# study:series.
for dir_name, subdir_names, file_names in os.walk(root_dir):
sids = sitk.ImageSeriesReader_GetGDCMSeriesIDs(dir_name)
for sid in sids:
file_names = sitk.ImageSeriesReader_GetGDCMSeriesFileNames(dir_name, sid)
reader.SetFileName(file_names[0])
reader.ReadImageInformation()
study = reader.GetMetaData("0020|000d")
key = "{0}:{1}".format(study, sid)
if key in all_series_files:
all_series_files[key].extend(file_names)
else:
all_series_files[key] = list(file_names)
# Get list of lists describing the results and then combine into a dataframe, faster
# than appending to the dataframe one by one.
res = [
inspect_single_series(series_data, meta_data_keys)
for series_data in all_series_files.items()
]
return pd.DataFrame(res, columns=column_names)
def main(argv=None):
parser = argparse.ArgumentParser()
parser.add_argument(
"root_of_data_directory", help="path to the topmost directory containing data"
)
parser.add_argument("output_file", help="output csv file path")
parser.add_argument(
"analysis_type",
default="per_file",
help='type of analysis, "per_file" or "per_series"',
)
parser.add_argument(
"--imageIO",
default="",
help="SimpleITK imageIO to use for reading (e.g. BMPImageIO)",
)
parser.add_argument(
"--external_applications",
default=[],
nargs="*",
help="paths to external applications",
)
parser.add_argument(
"--external_applications_headings",
default=[],
nargs="*",
help="titles of the results columns for external applications",
)
parser.add_argument(
"--metadata_keys",
nargs="*",
default=[],
help="inspect values of these metadata keys (DICOM tags or other keys stored in the file)",
)
parser.add_argument(
"--metadata_keys_headings",
default=[],
nargs="*",
help="titles of the results columns for the metadata_keys",
)
args = parser.parse_args(argv)
if len(args.external_applications) != len(args.external_applications_headings):
print("Number of external applications and their headings do not match.")
sys.exit(1)
if len(args.metadata_keys) != len(args.metadata_keys_headings):
print("Number of metadata keys and their headings do not match.")
sys.exit(1)
if args.analysis_type not in ["per_file", "per_series"]:
print("Unexpected analysis type.")
sys.exit(1)
if args.analysis_type == "per_file":
df = inspect_files(
args.root_of_data_directory,
imageIO=args.imageIO,
meta_data_keys=args.metadata_keys,
external_programs=args.external_applications,
additional_column_names=args.metadata_keys_headings
+ args.external_applications_headings,
)
df.to_csv(args.output_file, index=False)
sys.exit(0)
if args.analysis_type == "per_series":
df = inspect_series(
args.root_of_data_directory,
meta_data_keys=args.metadata_keys,
additional_column_names=args.metadata_keys_headings,
)
df.to_csv(args.output_file, index=False)
sys.exit(0)
if __name__ == "__main__":
sys.exit(main())