From 64647571ab6ec420556108fd1cae715bb2c0ebd8 Mon Sep 17 00:00:00 2001 From: RichardObi Date: Tue, 30 Apr 2024 20:01:23 +0200 Subject: [PATCH] renaming of project due to pypi availability. Implementation of multiprocessing of feature extraction. --- .idea/.name | 1 + .idea/{frd.iml => frd-score.iml} | 0 README.md | 18 +- setup.py | 12 +- src/frd/__main__.py | 3 - src/{frd => frd_score}/__init__.py | 0 src/frd_score/__main__.py | 3 + src/{frd/frd_score.py => frd_score/frd.py} | 309 +++++++++++---------- tests/test_frd.py | 16 +- 9 files changed, 190 insertions(+), 172 deletions(-) create mode 100644 .idea/.name rename .idea/{frd.iml => frd-score.iml} (100%) delete mode 100644 src/frd/__main__.py rename src/{frd => frd_score}/__init__.py (100%) create mode 100644 src/frd_score/__main__.py rename src/{frd/frd_score.py => frd_score/frd.py} (83%) diff --git a/.idea/.name b/.idea/.name new file mode 100644 index 0000000..3c9ce45 --- /dev/null +++ b/.idea/.name @@ -0,0 +1 @@ +frd_score \ No newline at end of file diff --git a/.idea/frd.iml b/.idea/frd-score.iml similarity index 100% rename from .idea/frd.iml rename to .idea/frd-score.iml diff --git a/README.md b/README.md index f3522f8..9cf3838 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ - + # Fréchet Radiomics Distance (FRD) @@ -18,11 +18,11 @@ In general, the variability (e.g. measured via FRD) of imaging biomarkers (e.g. ## Installation - + Install frd: ``` -pip install git+https://github.com/RichardObi/frd +pip install frd-score ``` Requirements: @@ -41,23 +41,23 @@ Requirements: To compute the FID score between two datasets, where images of each dataset are contained in an individual folder: ``` -python -m frd path/to/dataset_A path/to/dataset_B +python -m frd_score path/to/dataset_A path/to/dataset_B ``` If you would like to use masks to localize radiomics features, you can provide the path to the masks as follows: ``` -python -m frd path/to/dataset_A path/to/dataset_B -M path/to/mask_A path/to/mask_B +python -m frd_score path/to/dataset_A path/to/dataset_B -M path/to/mask_A path/to/mask_B ``` ### Run in your code: If you would like to import frd as a module, you can use the following code snippet: ``` -from frd import frd_score +from frd_score import frd paths=['path/to/dataset_A', 'path/to/dataset_B'] paths_masks=[path_mask_A, path_mask_B] # optionally, we can add masks. -frd_value = frd_score.compute_frd(paths, paths_masks=paths_masks) +frd_value = frd.compute_frd(paths, paths_masks=paths_masks) ``` ## Additional arguments @@ -78,11 +78,13 @@ frd_value = frd_score.compute_frd(paths, paths_masks=paths_masks) `--verbose` or `-v`: You may enable more detailed logging.info and logging.debug console logs by providing the `verbose` argument. +`--num_workers` or `-w`: The number of cpu workers used for multiprocessing during feature extraction. If set to None, then the system's number of available cpu cores minus 2 will be taken as default (1 is the minimum value for num_workers). + `--save-stats` or `-s`: As in [pytorch-fid](https://github.com/mseitzer/pytorch-fid), you can generate a compatible `.npz` archive of a dataset using the `--save-stats` flag. You may use the `.npz` archive as dataset path, which can be useful to compare multiple models against an original dataset without recalculating the statistics multiple times. ``` -python -m frd --save-stats path/to/dataset path/to/npz_outputfile +python -m frd_score --save-stats path/to/dataset path/to/npz_outputfile ``` diff --git a/setup.py b/setup.py index cb73581..9bbff9d 100644 --- a/setup.py +++ b/setup.py @@ -23,8 +23,8 @@ def get_version(rel_path): if __name__ == "__main__": setuptools.setup( - name="frd", - version=get_version(os.path.join("src", "frd", "__init__.py")), + name="frd-score", + version=get_version(os.path.join("src", "frd_score", "__init__.py")), author="Richard Osuala, Preeti Verma", description=( "Package for calculating Fréchet Radiomics Distance (FRD)" @@ -32,10 +32,10 @@ def get_version(rel_path): long_description=read("README.md"), license="Apache-2.0", long_description_content_type="text/markdown", - url="https://github.com/RichardObi/frd", - download_url="https://github.com/RichardObi/frd/archive/refs/tags/v0.0.1.tar.gz", + url="https://github.com/RichardObi/frd_score", + download_url="https://github.com/RichardObi/frd_score/archive/refs/tags/v0.0.1.tar.gz", project_urls={ - "Bug Tracker": "https://github.com/RichardObi/frd/issues", + "Bug Tracker": "https://github.com/RichardObi/frd_score/issues", }, package_dir={"": "src"}, packages=setuptools.find_packages(where="src"), @@ -47,7 +47,7 @@ def get_version(rel_path): python_requires=">=3.8", entry_points={ "console_scripts": [ - "frd = frd.frd_score:main", + "frd = frd_score.frd:main", ], }, install_requires=[ diff --git a/src/frd/__main__.py b/src/frd/__main__.py deleted file mode 100644 index 4b5e695..0000000 --- a/src/frd/__main__.py +++ /dev/null @@ -1,3 +0,0 @@ -import frd.frd_score - -frd.frd_score.main() diff --git a/src/frd/__init__.py b/src/frd_score/__init__.py similarity index 100% rename from src/frd/__init__.py rename to src/frd_score/__init__.py diff --git a/src/frd_score/__main__.py b/src/frd_score/__main__.py new file mode 100644 index 0000000..bcd0f9b --- /dev/null +++ b/src/frd_score/__main__.py @@ -0,0 +1,3 @@ +import frd_score.frd + +frd_score.frd.main() diff --git a/src/frd/frd_score.py b/src/frd_score/frd.py similarity index 83% rename from src/frd/frd_score.py rename to src/frd_score/frd.py index 64b4cec..e570f39 100644 --- a/src/frd/frd_score.py +++ b/src/frd_score/frd.py @@ -19,6 +19,10 @@ import pathlib import time from pathlib import Path +import multiprocessing as mp +from functools import partial + + import cv2 import numpy as np @@ -158,9 +162,129 @@ def parse_args() -> argparse.Namespace: "This can be useful for reproducibility and interpretability.", ) + parser.add_argument( + "-w", + "--num_workers", + type=int, + default=None, + help="The number of cpu workers used for multiprocessing during feature extraction. " + "If set to None, then the system's number of available cpu cores minus 2 will be taken as default. ", + ) + args = parser.parse_args() return args +def resize_image_array(sitk_image_array, resize_size, interpolation=cv2.INTER_LINEAR): + if len(sitk_image_array.shape) == 2: + sitk_image_array_resized = cv2.resize( + sitk_image_array, (resize_size, resize_size), interpolation=interpolation + ) + elif len(sitk_image_array.shape) == 3: + # Going through z axis (which should be at index position 0 here in sitk image, and resizing each slice + sitk_image_array_resized = np.zeros( + (sitk_image_array.shape[0], resize_size, resize_size) + ) + for j in range(sitk_image_array.shape[0]): + sitk_image_array_resized[j] = cv2.resize( + sitk_image_array[j], + (resize_size, resize_size), + interpolation=interpolation, + ) + else: + raise ValueError( + f"SITK Image array has an unexpected shape: {sitk_image_array.shape}. Expected 2D or 3D array (no channel dim). Got {len(sitk_image_array.shape)}" + ) + return sitk_image_array_resized + +def process_image_mask_pair(image_mask_pair, resize_size=None, feature_extractor=None, verbose=False): + image_path, mask_path = image_mask_pair + #for i, (image_path, mask_path) in enumerate(zip(image_paths, mask_paths)): + sitk_image = sitk.ReadImage( + str(image_path), outputPixelType=sitk.sitkFloat32 + ) + if mask_path is None: + # https://discourse.slicer.org/t/features-extraction/11047/3 + ma_arr = np.ones(sitk_image.GetSize()[::-1]).astype( + np.uint8 + ) # reverse the order as image is xyz, array is zyx + sitk_mask = sitk.GetImageFromArray(ma_arr) + try: + sitk_mask.CopyInformation(sitk_image) # Copy geometric info + except Exception as e: + logging.debug( + f"Error while trying to copy information from image to mask: {e}" + ) + pass + if verbose: + logging.debug( + "Empty mask (true everywhere) is used for feature extraction." + ) + else: + sitk_mask = sitk.ReadImage(str(mask_path)) + + # Check if the mask is in range [0, 255] and rescale it to [0, 1] + if np.max(sitk.GetArrayViewFromImage(sitk_mask)) == 255: + sitk_mask = sitk.Cast(sitk_mask, sitk.sitkFloat32) / 255.0 + + if resize_size is not None: + sitk_image_array = sitk.GetArrayViewFromImage(sitk_image) + sitk_image_array_resized = resize_image_array( + sitk_image_array, resize_size, interpolation=cv2.INTER_LINEAR + ) + sitk_image_resized = sitk.GetImageFromArray(sitk_image_array_resized) + try: + sitk_image_resized.CopyInformation(sitk_image) + except: + pass + sitk_image = ( + sitk_image_resized # Update the image to the resized version + ) + + sitk_mask_array = sitk.GetArrayViewFromImage(sitk_mask) + sitk_mask_array_resized = resize_image_array( + sitk_mask_array, resize_size, interpolation=cv2.INTER_LINEAR + ) + # After resizing, set all values above 0.5 to 1 and all values below to 0 + sitk_mask_array_resized[sitk_mask_array_resized > 0.5] = 1 + sitk_mask_array_resized[sitk_mask_array_resized <= 0.5] = 0 + sitk_mask_resized = sitk.GetImageFromArray(sitk_mask_array_resized) + try: + sitk_mask_resized.CopyInformation(sitk_mask) + except: + pass + sitk_mask = sitk_mask_resized + + # Check if the mask contains only one voxel. This needs to be done before and after resizing as the mask + if np.sum(sitk.GetArrayViewFromImage(sitk_mask)) <= 1: + if verbose: + logging.info( + f"Skipping mask (after potentially having applied resizing to {resize_size}) with only one segmented voxel:", + mask_path, + ) + return + + # Finally, run the feature extraction + try: + output = feature_extractor.execute(sitk_image, sitk_mask) + except Exception as e: + logging.debug(f"sitk_mask: {(sitk.GetArrayViewFromImage(sitk_mask))}") + logging.debug(f"sitk_image: {(sitk.GetArrayViewFromImage(sitk_image))}") + logging.debug( + f"shape sitk_mask: {(sitk.GetArrayViewFromImage(sitk_mask)).shape} and shape sitk_image: {(sitk.GetArrayViewFromImage(sitk_image)).shape}" + ) + logging.error( + f"Error occurred while extracting features for image {image_path} and mask {mask_path}: {e}" + ) + raise e + radiomics_features = {} + for feature_name in output.keys(): + if "diagnostics" not in feature_name: + radiomics_features[feature_name.replace("original_", "")] = float( + output[feature_name] + ) + radiomics_feature_values_list = list(radiomics_features.values()) + #pbar.update(1) + return radiomics_feature_values_list, radiomics_features, len(radiomics_feature_values_list) def compute_features( files, @@ -168,6 +292,7 @@ def compute_features( masks=None, resize_size=None, verbose=False, + num_workers = None, ): """Calculates the features of the given query image (optionally, alongside a respective segmentation mask). @@ -182,13 +307,9 @@ def compute_features( -- A numpy array of dimension (num images, num_features) that contains the extracted features of the given query image (optionally, alongside a respective segmentation mask). """ - prediction_array = None - image_paths = [] mask_paths = [] - radiomics_results = [] for idx, file_path in enumerate(files): - image_paths.append(file_path) if masks is not None: # Note: Sorting assumption here: Masks and images are in separate folders. Each image has a mask and # mask and image file are named similarly enough that sorting assures correspondence between image and mask index positions. @@ -205,155 +326,38 @@ def compute_features( else: mask_paths.append(None) - total = len(image_paths) - - with tqdm(total=total) as pbar: - for i, (image_path, mask_path) in enumerate(zip(image_paths, mask_paths)): - sitk_image = sitk.ReadImage( - str(image_path), outputPixelType=sitk.sitkFloat32 - ) - if mask_path is None: - # https://discourse.slicer.org/t/features-extraction/11047/3 - ma_arr = np.ones(sitk_image.GetSize()[::-1]).astype( - np.uint8 - ) # reverse the order as image is xyz, array is zyx - sitk_mask = sitk.GetImageFromArray(ma_arr) - try: - sitk_mask.CopyInformation(sitk_image) # Copy geometric info - except Exception as e: - logging.debug( - f"Error while trying to copy information from image to mask: {e}" - ) - pass - if verbose: - logging.debug( - "Empty mask (true everywhere) is used for feature extraction." - ) - else: - sitk_mask = sitk.ReadImage(str(mask_path)) + if verbose: + # get some logging.infos to check the progress and if everything is working + logging.info( + f"Now processing corresponding image-mask pairs (e.g., IMG[0]:{files[0]}, MASK[0]: {mask_paths[0]}. Do these correspond?" + ) - # Check if the mask is in range [0, 255] and rescale it to [0, 1] - if np.max(sitk.GetArrayViewFromImage(sitk_mask)) == 255: - sitk_mask = sitk.Cast(sitk_mask, sitk.sitkFloat32) / 255.0 + # multiprocessing feature extraction + if num_workers is None: num_workers = mp.cpu_count() - 2 # dont use all cpus. + if num_workers < 1: num_workers = 1 + process_pairs=partial(process_image_mask_pair, resize_size=resize_size, feature_extractor=feature_extractor, verbose=verbose) + with mp.Pool(processes=num_workers) as p: + # features_list, list_radiomics_feature_dict_list, num_features_list = tqdm(p.imap(process_pairs, zip(files, mask_paths)), total=len(files)) + results = tqdm(p.imap(process_pairs, zip(files, mask_paths)), total=len(files)) + #results = p.map(process_pairs, zip(files, mask_paths)) - if verbose and i % 100 == 0: - # get some logging.infos to check the progress and if everything is working - logging.info( - f"Now processing corresponding image-mask pair (IMG:{image_path}, MASK: {mask_path}. Do these correspond?" - ) + # get the first element of results + features_list = [] + list_radiomics_feature_dict_list = [] + for result in results: + features_list.append(result[0]) + list_radiomics_feature_dict_list.append(result[1]) - if resize_size is not None: - sitk_image_array = sitk.GetArrayViewFromImage(sitk_image) - sitk_image_array_resized = resize_image_array( - sitk_image_array, resize_size, interpolation=cv2.INTER_LINEAR - ) - sitk_image_resized = sitk.GetImageFromArray(sitk_image_array_resized) - try: - sitk_image_resized.CopyInformation(sitk_image) - except: - pass - sitk_image = ( - sitk_image_resized # Update the image to the resized version - ) + if verbose: logging.info(f"Number of radiomics features: {len(features_list[0])}") - sitk_mask_array = sitk.GetArrayViewFromImage(sitk_mask) - sitk_mask_array_resized = resize_image_array( - sitk_mask_array, resize_size, interpolation=cv2.INTER_LINEAR - ) - # After resizing, set all values above 0.5 to 1 and all values below to 0 - sitk_mask_array_resized[sitk_mask_array_resized > 0.5] = 1 - sitk_mask_array_resized[sitk_mask_array_resized <= 0.5] = 0 - sitk_mask_resized = sitk.GetImageFromArray(sitk_mask_array_resized) - try: - sitk_mask_resized.CopyInformation(sitk_mask) - except: - pass - sitk_mask = sitk_mask_resized - - # Check if the mask contains only one voxel. This needs to be done before and after resizing as the mask - if np.sum(sitk.GetArrayViewFromImage(sitk_mask)) <= 1: - if verbose: - logging.info( - f"Skipping mask (after potentially having applied resizing to {resize_size}) with only one segmented voxel:", - mask_path, - ) - continue + # processing result in numpy array + pred_arr = np.asarray(features_list) #np.empty(len(image_paths), num_features_list) - # Finally, run the feature extraction - try: - output = feature_extractor.execute(sitk_image, sitk_mask) - except Exception as e: - logging.debug(f"sitk_mask: {(sitk.GetArrayViewFromImage(sitk_mask))}") - logging.debug(f"sitk_image: {(sitk.GetArrayViewFromImage(sitk_image))}") - logging.debug( - f"shape sitk_mask: {(sitk.GetArrayViewFromImage(sitk_mask)).shape} and shape sitk_image: {(sitk.GetArrayViewFromImage(sitk_image)).shape}" - ) - logging.error( - f"Error occurred while extracting features for image {i} from image {image_path} and mask {mask_path}: {e}" - ) - raise e - radiomics_features = {} - for feature_name in output.keys(): - if "diagnostics" not in feature_name: - radiomics_features[feature_name.replace("original_", "")] = float( - output[feature_name] - ) - radiomics_results.append(radiomics_features) - if verbose: - logging.debug( - f"img_shape:{sitk.GetArrayViewFromImage(sitk_image).shape}, features: {len(list(radiomics_features.values()))}" - ) + return pred_arr, list_radiomics_feature_dict_list, mask_paths - try: - # We check if pred_arr is defined already. - pred_arr - except NameError: - # As pred_arr was not yet defined we initialize it here based on the length/dimensionality of radiomics features - pred_arr = np.empty( - (len(image_paths), len(list(radiomics_features.values()))) - ) - pred_arr[i] = list(radiomics_features.values()) - if verbose: - logging.debug( - f"Total number of features extracted for image {i}: {len(pred_arr[i])}" - ) - pbar.update(1) - if radiomics_results and verbose: - logging.info(f"Number of radiomics features: {len(radiomics_results[0])}") - try: - prediction_array = pred_arr - except NameError: - logging.warning( - f"Error: No features extracted from the image files in the dataset {idx}. " - f"Last extracted radiomics features were: {radiomics_results[-1]} " - ) - return prediction_array, radiomics_results, image_paths, mask_paths - - -def resize_image_array(sitk_image_array, resize_size, interpolation=cv2.INTER_LINEAR): - if len(sitk_image_array.shape) == 2: - sitk_image_array_resized = cv2.resize( - sitk_image_array, (resize_size, resize_size), interpolation=interpolation - ) - elif len(sitk_image_array.shape) == 3: - # Going through z axis (which should be at index position 0 here in sitk image, and resizing each slice - sitk_image_array_resized = np.zeros( - (sitk_image_array.shape[0], resize_size, resize_size) - ) - for j in range(sitk_image_array.shape[0]): - sitk_image_array_resized[j] = cv2.resize( - sitk_image_array[j], - (resize_size, resize_size), - interpolation=interpolation, - ) - else: - raise ValueError( - f"SITK Image array has an unexpected shape: {sitk_image_array.shape}. Expected 2D or 3D array (no channel dim). Got {len(sitk_image_array.shape)}" - ) - return sitk_image_array_resized def save_features_to_csv(csv_file_path, image_paths, mask_paths, feature_data): @@ -604,6 +608,7 @@ def calculate_feature_statistics( verbose: bool = False, save_features: bool = False, norm_sets_separately: bool = True, + num_workers=None, ) -> (list, list): """Calculation of the statistics used by the FRD. Params: @@ -623,12 +628,13 @@ def calculate_feature_statistics( feature_list = [] for idx, file_list in enumerate(file_lists): - features, radiomics_results, image_paths, mask_paths = compute_features( + features, radiomics_results, mask_paths = compute_features( files=file_list, feature_extractor=feature_extractor, masks=mask_lists[idx] if mask_lists is not None else None, resize_size=resize_size, verbose=verbose, + num_workers=num_workers, ) if verbose: logging.debug(f"features of radiomics: {features}") @@ -713,10 +719,10 @@ def calculate_feature_statistics( f"radiomics_set{idx}_results_normalized_{folder_name}_{unique_identifier}.csv", ) save_features_to_csv( - csv_file_path, image_paths, mask_paths, radiomics_results + csv_file_path, file_list, mask_paths, radiomics_results ) save_features_to_csv( - norm_csv_file_path, image_paths, mask_paths, radiomics_results + norm_csv_file_path, file_list, mask_paths, radiomics_results ) return mu_list, sigma_list @@ -732,6 +738,7 @@ def compute_statistics_of_paths( verbose=False, save_features=False, norm_sets_separately=True, + num_workers=None, ): """Calculates the statistics later used to compute the Frechet Distance for a given paths (i.e. one of the two distributions).""" @@ -784,6 +791,7 @@ def compute_statistics_of_paths( verbose=verbose, save_features=save_features, norm_sets_separately=norm_sets_separately, + num_workers=num_workers, ) @@ -831,6 +839,7 @@ def compute_frd( verbose=False, save_features=True, norm_sets_separately=True, + num_workers=None, ): """Calculates the FRD based on the statistics from the two paths (i.e. the two distributions) Params: @@ -870,6 +879,7 @@ def compute_frd( verbose=verbose, save_features=save_features, norm_sets_separately=norm_sets_separately, + num_workers=num_workers, ) if verbose: @@ -895,6 +905,8 @@ def save_frd_stats( resize_size=None, verbose=False, save_features=True, + num_workers=None, + ): """Inits feature extractor creation and subsequent statistics computation and saving for the two distributions.""" @@ -927,6 +939,7 @@ def save_frd_stats( resize_size=resize_size, verbose=verbose, save_features=save_features, + num_workers=num_workers, ) np.savez_compressed(paths[1], mu=mu_list[0], sigma=sigma_list[0]) @@ -955,6 +968,7 @@ def main(): resize_size=args.resize_size, verbose=args.verbose, save_features=args.save_features, + num_workers=args.num_workers, ) return @@ -968,6 +982,7 @@ def main(): verbose=args.verbose, save_features=args.save_features, norm_sets_separately=not args.norm_across, + num_workers=args.num_workers, ) # logging the result logging.info( diff --git a/tests/test_frd.py b/tests/test_frd.py index 80da9e0..c062672 100644 --- a/tests/test_frd.py +++ b/tests/test_frd.py @@ -12,7 +12,7 @@ import numpy as np from PIL import Image -from src.frd import frd_score +from src.frd_score import frd class TestFRD: @@ -49,7 +49,7 @@ def test_frd_2d(self): Path(path_a).mkdir(exist_ok=True) Path(path_b).mkdir(exist_ok=True) - for ext in frd_score.IMAGE_EXTENSIONS: + for ext in frd.IMAGE_EXTENSIONS: if ext != "nii.gz": # create image with different extensions in_image_1.save(f"{path_a}/img.{ext}") @@ -71,7 +71,7 @@ def test_frd_2d(self): # Test if this function raises error try: paths = [path_a, f"{path_a}/statistics.npz"] - frd_score.save_frd_stats( + frd.save_frd_stats( paths, features, norm_type, @@ -87,7 +87,7 @@ def test_frd_2d(self): # Test if this function raises error try: paths = [path_a, path_b] - frd_value = frd_score.compute_frd( + frd_value = frd.compute_frd( paths, features, norm_type, @@ -158,7 +158,7 @@ def test_frd_3d(self): try: # Should get very high FRD value paths = [path_a, path_b] - frd_value = frd_score.compute_frd( + frd_value = frd.compute_frd( paths, features, norm_type, @@ -182,7 +182,7 @@ def test_frd_3d(self): paths = [path_a, path_a] norm_type = "minmax" norm_range = [0.0, 5.0] - frd_value = frd_score.compute_frd( + frd_value = frd.compute_frd( paths, features, norm_type, @@ -234,7 +234,7 @@ def test_frd_3d(self): norm_range = [0.0, 7.0] try: paths = [path_a, path_a] - frd_value = frd_score.compute_frd( + frd_value = frd.compute_frd( paths, features, norm_type, @@ -257,7 +257,7 @@ def test_frd_3d(self): ### Try different images with different masks try: paths = [path_a, path_b] - frd_value = frd_score.compute_frd( + frd_value = frd.compute_frd( paths, features, norm_type,