From aa2ae4543b6c3e0a439d7f2f89382d2a2a14eec9 Mon Sep 17 00:00:00 2001 From: xavier-owkin Date: Tue, 19 Dec 2023 16:38:06 +0000 Subject: [PATCH] Add option for using other folder in dataset creation script --- .../dataset_creation_scripts/tiling_slides.py | 23 +++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/flamby/datasets/fed_camelyon16/dataset_creation_scripts/tiling_slides.py b/flamby/datasets/fed_camelyon16/dataset_creation_scripts/tiling_slides.py index 0acdce2dd..529442081 100644 --- a/flamby/datasets/fed_camelyon16/dataset_creation_scripts/tiling_slides.py +++ b/flamby/datasets/fed_camelyon16/dataset_creation_scripts/tiling_slides.py @@ -52,7 +52,9 @@ def __len__(self): def __getitem__(self, idx): pil_image = self.slide.read_region( - self.coords[idx].astype("int_"), self.level, (self.tile_size, self.tile_size) + self.coords[idx].astype("int_"), + self.level, + (self.tile_size, self.tile_size), ).convert("RGB") if self.transform is not None: pil_image = self.transform(pil_image) @@ -68,7 +70,7 @@ def save_dict_to_csv(dict_arg, file_name): df.to_csv(file_name, index=False) -def main(batch_size, num_workers_torch, tile_from_scratch, remove_big_tiff): +def main(batch_size, num_workers_torch, tile_from_scratch, remove_big_tiff, output_path): """Function tiling slides that have been downloaded using download.py. Parameters @@ -83,9 +85,12 @@ def main(batch_size, num_workers_torch, tile_from_scratch, remove_big_tiff): If this option is activated we disregard the csv files with precomputed coordinates. - remove_big_tiff : bool + remove_big_tiff: bool Whether or not to get rid of all original slides after tiling. + output_path: str + An optional path to store the dataset after tiling + Raises ------ ValueError @@ -125,6 +130,10 @@ def main(batch_size, num_workers_torch, tile_from_scratch, remove_big_tiff): sys.exit() slides_dir = dict["dataset_path"] + if output_path is not None: + output_slides_dir = output_path + else: + output_slides_dir = slides_dir slides_paths = glob(os.path.join(slides_dir, "*.tif")) grid_tiles_extractor = GridTiler( tile_size=(224, 224), @@ -167,7 +176,7 @@ def main(batch_size, num_workers_torch, tile_from_scratch, remove_big_tiff): for sp in tqdm(slides_paths): slide_name = os.path.basename(sp) - path_to_features = os.path.join(slides_dir, slide_name + ".npy") + path_to_features = os.path.join(output_slides_dir, slide_name + ".npy") if os.path.exists(path_to_features): continue print(f"Tiling slide {sp}") @@ -247,6 +256,8 @@ def main(batch_size, num_workers_torch, tile_from_scratch, remove_big_tiff): np.save(path_to_features, features) write_value_in_config(config_file, "preprocessing_complete", True) + if output_path is not None: + write_value_in_config(config_file, "dataset_path", output_path) if args.remove_big_tiff: print("Removing all slides") @@ -279,6 +290,9 @@ def main(batch_size, num_workers_torch, tile_from_scratch, remove_big_tiff): help="Whether or not to remove the original slides images that take \ up to 800G, after computing the features using them.", ) + parser.add_argument( + "--output-path", type=str, help="The path where to store the tiles" + ) args = parser.parse_args() main( @@ -286,4 +300,5 @@ def main(batch_size, num_workers_torch, tile_from_scratch, remove_big_tiff): args.num_workers_torch, args.tile_from_scratch, args.remove_big_tiff, + args.output_path, )