diff --git a/jlab_datascience_toolkit/analyses/__init__.py b/jlab_datascience_toolkit/analyses/__init__.py new file mode 100644 index 0000000..375a359 --- /dev/null +++ b/jlab_datascience_toolkit/analyses/__init__.py @@ -0,0 +1,19 @@ +from jlab_datascience_toolkit.utils.registration import register, make, list_registered_modules + +# Residual analyzer: +register( + id="ResidualAnalyzer_v0", + entry_point="jlab_datascience_toolkit.analyses.residual_analyzer:ResidualAnalyzer" +) + +# Data reconstruction: +register( + id="DataReconstruction_v0", + entry_point="jlab_datascience_toolkit.analyses.data_reconstruction:DataReconstruction" +) + +# Learning Curve Visualizer: +register( + id="LearningCurveVisualizer_v0", + entry_point="jlab_datascience_toolkit.analyses.learning_curve_visualizer:LearningCurveVisualizer" +) diff --git a/jlab_datascience_toolkit/analyses/data_reconstruction.py b/jlab_datascience_toolkit/analyses/data_reconstruction.py new file mode 100644 index 0000000..4fb0eb7 --- /dev/null +++ b/jlab_datascience_toolkit/analyses/data_reconstruction.py @@ -0,0 +1,194 @@ +from jlab_datascience_toolkit.core.jdst_analysis import JDSTAnalysis +import tensorflow as tf +import numpy as np +import os +import inspect +import yaml +import logging + +class DataReconstruction(JDSTAnalysis): + ''' + Simple module that passes input data x through a model: + + x_rec = model(x) + + where model can be a (variational) Autoencoder, U-Net, Diffusion model,... + The data here is processed via the tf.dataset system, in order to efficiently handle large data sets. + + Input(s): + i) Numpy arrays / images + ii) A trained model + + Output(s): + i) Dictionary with reconstructed images and (optional) original images + ''' + + # Initialize: + #********************************************* + def __init__(self,path_to_cfg,user_config={}): + # Define the module and module name: + self.module_name = "data_reconstruction" + + # Load the configuration: + self.config = self.load_config(path_to_cfg,user_config) + + # Save this config, if a path is provided: + if 'store_cfg_loc' in self.config: + self.save_config(self.config['store_cfg_loc']) + + # General settings: + self.output_loc = self.config['output_loc'] + self.data_store_name = self.config['data_store_name'] + + # Data processing settings: + self.buffer_size = self.config['buffer_size'] + self.n_analysis_samples = self.config['n_analysis_samples'] + self.analysis_sample_size = self.config['analysis_sample_size'] + + # Get names of the data: + self.data_names = self.config['data_names'] + self.record_original_data = self.config['record_original_data'] + + self.store_data = False + if self.output_loc is not None and self.output_loc.lower() != "": + self.store_data = True + os.makedirs(self.output_loc,exist_ok=True) + #********************************************* + + # Check the input data type: + #********************************************* + def check_input_data_type(self,x=None,model_list=[]): + + if isinstance(x,np.ndarray) and isinstance(model_list,list): + pass_model_type_check = False + if len(model_list) > 0: + pass_model_type_check = True + #+++++++++++++++ + for m in model_list: + if isinstance(m,tf.keras.Model) == False: + pass_model_type_check = False + #+++++++++++++++ + + return pass_model_type_check + else: + logging.error(f">>> {self.module_name}: The provided data does not match the requirements. The first argument has to be a numpy array, Whereas the second argument should be a non-empty list with tf.keras.Model. Going to return None. 
<<<") + return False + #********************************************* + + # Provide information about this module: + #********************************************* + def get_info(self): + print(inspect.getdoc(self)) + #********************************************* + + # Handle configurations: + #********************************************* + # Load the config: + def load_config(self,path_to_cfg,user_config): + with open(path_to_cfg, 'r') as file: + cfg = yaml.safe_load(file) + + # Overwrite config with user settings, if provided + try: + if bool(user_config): + #++++++++++++++++++++++++ + for key in user_config: + cfg[key] = user_config[key] + #++++++++++++++++++++++++ + except: + logging.exception(">>> " + self.module_name +": Invalid user config. Please make sure that a dictionary is provided <<<") + + return cfg + + #----------------------------- + + # Store the config: + def save_config(self,path_to_config): + with open(path_to_config, 'w') as file: + yaml.dump(self.config, file) + #********************************************* + + # Reconstruct the data: + #********************************************* + # First, we need a model prediction: + def get_model_predictions(self,x,model_list): + # Go through all elements within the model list and collect the predictions + x_in = x + #++++++++++++++++++ + for model in model_list: + x_out = model.predict_on_batch(x_in) + x_in = x_out + #++++++++++++++++++ + + return x_out + + #------------------------------ + + # Now run the reconstruction: + def reconstruct_data(self,x,model_list): + # First, we need to create a tf data set which shows the beauty of this method: + + # Provide the option to only analyze a part of the initial data: + n_ana_samples = x.shape[0] + if self.n_analysis_samples > 0: + n_ana_samples = self.n_analysis_samples + + # If we only analyze a fraction of the data, we need to record to original data as well: + self.record_original_data = True + + tf_data = tf.data.Dataset.from_tensor_slices(x).shuffle(buffer_size=self.buffer_size).take(n_ana_samples).batch(self.analysis_sample_size) + + # Second, make sure that we have a model list: + if type(model_list) != list: + model_list = [model_list] + + # Third, make some predictions: + predictions = [] + inputs = [] + #++++++++++++++++++++++ + for sample in tf_data: + # Get the prediction: + current_pred = self.get_model_predictions(sample,model_list) + predictions.append(current_pred) + + if self.record_original_data == True: + inputs.append(sample) + #++++++++++++++++++++++ + + # Record everything: + result_dict = {} + result_dict[self.data_names[1]] = np.concatenate(predictions,axis=0) + + if self.record_original_data == True: + result_dict[self.data_names[0]] = np.concatenate(inputs,axis=0) + else: + result_dict[self.data_names[0]] = None + + return result_dict + #********************************************* + + # Run the analysis: + #********************************************* + def run(self,x,model_list): + # Run type check: + if self.check_input_data_type(x,model_list): + results = self.reconstruct_data(x,model_list) + + if self.store_data: + np.save(self.output_loc+"/"+self.data_store_name+".npy",np.array(results,dtype=object)) + + return results + + else: + return None + #********************************************* + + # Save and load are not active here: + #**************************** + def save(self): + pass + + def load(self): + pass + #**************************** + diff --git a/jlab_datascience_toolkit/analyses/learning_curve_visualizer.py 
b/jlab_datascience_toolkit/analyses/learning_curve_visualizer.py new file mode 100644 index 0000000..355298c --- /dev/null +++ b/jlab_datascience_toolkit/analyses/learning_curve_visualizer.py @@ -0,0 +1,175 @@ +from jlab_datascience_toolkit.core.jdst_analysis import JDSTAnalysis +import matplotlib.pyplot as plt +import os +import yaml +import inspect +import logging + +class LearningCurveVisualizer(JDSTAnalysis): + ''' + Simple class to visualize the learning curves produced during model training. + + Input(s): + i) Dictionary with all loss curves + + Output(s): + i) .png files visualizing the learning curves + ''' + + # Initialize: + #********************************************* + def __init__(self,path_to_cfg,user_config={}): + # Set the name specific to this module: + self.module_name = "learning_curve_visualizer" + + # Load the configuration: + self.config = self.load_config(path_to_cfg,user_config) + + # Get plots that shall be produced: + self.plots = self.config['plots'] + # Get the corresponding plot labels, plot legends and the names of each individual plot: + self.plot_labels = self.config['plot_labels'] + self.plot_legends = self.config['plot_legends'] + self.plot_names = self.config['plot_names'] + + # Cosmetics: + self.fig_size = self.config['fig_size'] + self.line_width = self.config['line_width'] + self.font_size = self.config['font_size'] + self.leg_font_size = self.config['leg_font_size'] + + # Set font size: + plt.rcParams.update({'font.size':self.font_size}) + + # Save this config, if a path is provided: + if 'store_cfg_loc' in self.config: + self.save_config(self.config['store_cfg_loc']) + + # Get the output location and create proper folders: + self.output_loc = self.config['output_loc'] + self.plot_loc = self.output_loc+"/learning_curves" + + os.makedirs(self.output_loc,exist_ok=True) + os.makedirs(self.plot_loc,exist_ok=True) + #********************************************* + + # Check the data type: + #********************************************* + def check_input_data_type(self,data): + if isinstance(data,dict) == True: + if bool(dict) == False: + logging.error(f">>> {self.module_name}: Your dictionary {data} is empty. Please check. Going to return None. <<<") + return False + return True + + else: + logging.error(f">>> {self.module_name}: The data type you provided {type(data)} is not a dictionary. Please check. Going to return None. <<<") + return False + #********************************************* + + # Provide information about this module: + #********************************************* + def get_info(self): + print(inspect.getdoc(self)) + #********************************************* + + # Handle configurations: + #********************************************* + # Load the config: + def load_config(self,path_to_cfg,user_config): + with open(path_to_cfg, 'r') as file: + cfg = yaml.safe_load(file) + + # Overwrite config with user settings, if provided + try: + if bool(user_config): + #++++++++++++++++++++++++ + for key in user_config: + cfg[key] = user_config[key] + #++++++++++++++++++++++++ + except: + logging.exception(">>> " + self.module_name +": Invalid user config. 
Please make sure that a dictionary is provided <<<") + + return cfg + + #----------------------------- + + # Store the config: + def save_config(self,path_to_config): + with open(path_to_config, 'w') as file: + yaml.dump(self.config, file) + #********************************************* + + # Run the entire analysis: + #********************************************* + # Peoduce a single plot, based on scores and legends: + def produce_single_plot(self,history,scores,legend_entries,axis): + if legend_entries is None: + #++++++++++++++++ + for s in scores: + if s in history: + metric = history[s] + x = [k for k in range(1,1+len(metric))] + axis.plot(x,metric,linewidth=self.line_width) + #++++++++++++++++ + + else: + #++++++++++++++++ + for s,l in zip(scores,legend_entries): + if s in history: + metric = history[s] + x = [k for k in range(1,1+len(metric))] + axis.plot(x,metric,linewidth=self.line_width,label=l) + #++++++++++++++++ + axis.legend(fontsize=self.leg_font_size) + + + def run(self,training_history): + if self.check_input_data_type(training_history): + # Loop through all plots that we wish to produce: + #+++++++++++++++++++++++ + for plot in self.plots: + # Create a canvas to draw on: + fig,ax = plt.subplots(figsize=self.fig_size) + + scores = self.plots[plot] + + legend_entries = self.plot_legends.get(plot,None) + labels = self.plot_labels.get(plot,None) + name = self.plot_names.get(plot,None) + + if legend_entries is not None: + assert len(legend_entries) == len(scores), logging.error(f">>> {self.module_name}: Number of legend entries {legend_entries} does not match the number of available score {scores} <<<") + + # Produce a nice plot: + self.produce_single_plot(training_history,scores,legend_entries,ax) + ax.grid(True) + + if labels is not None: + assert len(labels) == 2, logging.error(f">>> {self.module_name}: Number of plot labels {labels} does not match exptected number of two entries <<<") + + # Add labels if available: + ax.set_xlabel(labels[0]) + ax.set_ylabel(labels[1]) + + # Store the figure somewhere: + if name is not None: + fig.savefig(self.plot_loc+"/"+name+".png") + plt.close(fig) + #+++++++++++++++++++++++ + + else: + return None + #********************************************* + + # Save and load are not active here: + #********************************************* + def save(self): + pass + + def load(self): + pass + #********************************************* + + + diff --git a/jlab_datascience_toolkit/analyses/residual_analyzer.py b/jlab_datascience_toolkit/analyses/residual_analyzer.py new file mode 100644 index 0000000..d9ecac1 --- /dev/null +++ b/jlab_datascience_toolkit/analyses/residual_analyzer.py @@ -0,0 +1,204 @@ +from jlab_datascience_toolkit.core.jdst_analysis import JDSTAnalysis +import matplotlib.pyplot as plt +import numpy as np +import os +import inspect +import yaml +import imageio +import logging + +class ResidualAnalyzer(JDSTAnalysis): + ''' + Simple class to compare the input and reconstructed (e.g. from an autoencoder) data by computing residuals. 
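+
+    The residual of each image is (x_real - x_rec), reduced along reduction_axis
+    according to reduction_mode ("mean", "abs_mean" or "squared_mean"). As a
+    minimal sketch of the default "mean" reduction, assuming 4D image tensors of
+    shape (N, H, W, C) and the default reduction_axis = 3:
+
+        residual = np.mean(x_real - x_rec, axis=3)
+
+    Input(s):
+    i) Dictionary with the original and reconstructed numpy images
+
+    Output(s):
+    i) .png files comparing original, reconstructed and residual images
+    ii) Optional: a .gif movie of these comparison plots (if movie_duration > 0)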
+ ''' + + # Initialize: + #**************************** + def __init__(self,path_to_cfg,user_config={}): + # Define the module and module name: + self.module_name = "residual_analyzer" + + # Load the configuration: + self.config = self.load_config(path_to_cfg,user_config) + + # Save this config, if a path is provided: + if 'store_cfg_loc' in self.config: + self.save_config(self.config['store_cfg_loc']) + + # General settings: + self.output_loc = self.config['output_loc'] + self.residual_dir = self.output_loc+"/residuals" + self.real_data_name = self.config['real_data_name'] + self.rec_data_name = self.config['rec_data_name'] + + # Define the reduction mode, i.e. how the residuals are computed: + self.reduction_mode = self.config["reduction_mode"] + self.reduction_axis = self.config["reduction_axis"] + + # Settings to plot images: + self.imageplot_figsize = self.config["imageplot_figsize"] + self.imageplot_noaxes = self.config["imageplot_noaxes"] + + # Settings to store all images as movies: + self.movie_duration = self.config["movie_duration"] + + os.makedirs(self.output_loc,exist_ok=True) + os.makedirs(self.residual_dir,exist_ok=True) + #**************************** + + # Check input data type: + #**************************** + def check_input_data_type(self,data): + if isinstance(data,dict) == True: + if bool(dict) == False: + logging.error(f">>> {self.module_name}: Your dictionary {data} is empty. Please check. Going to return None. <<<") + return False + + return True + + else: + logging.error(f">>> {self.module_name}: The data type you provided {type(data)} is not a dictionary. Please check. Going to return None. <<<") + return False + #**************************** + + # Provide information about this module: + #********************************************* + def get_info(self): + print(inspect.getdoc(self)) + #********************************************* + + # Handle configurations: + #********************************************* + # Load the config: + def load_config(self,path_to_cfg,user_config): + with open(path_to_cfg, 'r') as file: + cfg = yaml.safe_load(file) + + # Overwrite config with user settings, if provided + try: + if bool(user_config): + #++++++++++++++++++++++++ + for key in user_config: + cfg[key] = user_config[key] + #++++++++++++++++++++++++ + except: + logging.exception(">>> " + self.module_name +": Invalid user config. Please make sure that a dictionary is provided <<<") + + return cfg + + #----------------------------- + + # Store the config: + def save_config(self,path_to_config): + with open(path_to_config, 'w') as file: + yaml.dump(self.config, file) + #********************************************* + + # Compute the residuals: + #**************************** + # Make sure that we are operating in 3 dimensions: + def image_dimension_check(self,image): + if len(image.shape) < 4: + return np.expand_dims(image,3) + return image + + #----------------------- + + # Now compute the residuals: + def compute_residuals(self,x_real,x_rec): + residual = self.image_dimension_check(x_real) - self.image_dimension_check(x_rec) + + if self.reduction_mode.lower() == "mean": + return np.mean(residual,axis=self.reduction_axis) + elif self.reduction_mode.lower() == "abs_mean": + return np.mean(np.abs(residual),axis=self.reduction_axis) + elif self.reduction_mode.lower() == "squared_mean": + return np.mean(np.square(residual),axis=self.reduction_axis) + else: + logging.warning(f">>> {self.module_name}: Reduction mode {self.reduction_mode} does not exist. 
Going to use mean reduction mode (default)<<<") + return np.mean(residual,axis=self.reduction_axis) + #**************************** + + # Generic function to plot an image: + #**************************** + def plot_images(self,real_images,rec_images,residual_images,path,name): + + #++++++++++++++++++++++ + for i in range(real_images.shape[0]): + fig,ax = plt.subplots(1,3,figsize=self.imageplot_figsize) + + ax[0].set_title('Original') + ax[0].imshow(real_images[i]) + + if self.imageplot_noaxes: + ax[0].set_axis_off() + + ax[1].set_title('Reconstructed') + ax[1].imshow(rec_images[i]) + + if self.imageplot_noaxes: + ax[1].set_axis_off() + + ax[2].set_title('Residual') + ax[2].imshow(residual_images[i]) + + if self.imageplot_noaxes: + ax[2].set_axis_off() + + fig.savefig(path+"/"+name+"_"+str(i)+".png") + plt.close(fig) + #++++++++++++++++++++++ + #**************************** + + # Translate images to a single movie (just a little gimmick to better visualize the data) + #**************************** + def png_to_movie(self,png_path,movie_path,movie_name): + filenames = [] + #++++++++++++++++++++++++++ + for file in os.listdir(png_path): + if ".png" in file: + filenames.append(os.path.join(png_path, file)) + #++++++++++++++++++++++++++ + + filenames = sorted(filenames) + images = [] + shape = None + for filename in filenames: + img = imageio.imread(filename) + if shape == None: + shape = img.shape + + images.append(img) + + imageio.mimsave(os.path.join(movie_path,movie_name+'.gif'),images, duration=self.movie_duration) + #**************************** + + # Put it all together: + #**************************** + def run(self,data_dict): + if self.check_input_data_type(data_dict): + x_real = data_dict[self.real_data_name] + x_rec = data_dict[self.rec_data_name] + + # Compute the residuals first: + residuals = self.compute_residuals(x_real,x_rec) + + # Plot the residuals + self.plot_images(x_real,x_rec,residuals,self.residual_dir,"res") + + # Store everything as a movie, if duration is specified: + if self.movie_duration > 0.0: + self.png_to_movie(self.residual_dir,self.residual_dir,"res_mov") + else: + return None + #**************************** + + # Save and load are not active here: + #**************************** + def save(self): + pass + + def load(self): + pass + #**************************** + diff --git a/jlab_datascience_toolkit/cfgs/.DS_Store b/jlab_datascience_toolkit/cfgs/.DS_Store new file mode 100644 index 0000000..0f1de2f Binary files /dev/null and b/jlab_datascience_toolkit/cfgs/.DS_Store differ diff --git a/jlab_datascience_toolkit/cfgs/defaults/data_reconstruction_cfg.yaml b/jlab_datascience_toolkit/cfgs/defaults/data_reconstruction_cfg.yaml new file mode 100644 index 0000000..106671a --- /dev/null +++ b/jlab_datascience_toolkit/cfgs/defaults/data_reconstruction_cfg.yaml @@ -0,0 +1,7 @@ +output_loc: null +data_store_name: "rec_data" +buffer_size: 100 +n_analysis_samples: 10 +analysis_sample_size: 10 +data_names: ['x_orig','x_rec'] +record_original_data: False diff --git a/jlab_datascience_toolkit/cfgs/defaults/hpo_keras_cnn_ae_cfg.yaml b/jlab_datascience_toolkit/cfgs/defaults/hpo_keras_cnn_ae_cfg.yaml new file mode 100644 index 0000000..9d9c721 --- /dev/null +++ b/jlab_datascience_toolkit/cfgs/defaults/hpo_keras_cnn_ae_cfg.yaml @@ -0,0 +1,51 @@ +# Which model to use / load: +model_id: "KerasCNNAE_v0" +model_cfg_loc: "" +# Model specific settings: +max_pooling: null +kernel_size: 3 +stride: 2 +optimizer: "adam" +conv_kernel_initialization: 'normal' +conv_bias_initialization: 
'zeros' +dense_kernel_initialization: 'normal' +dense_bias_initialization: 'zeros' +# HPO specific settings: +n_hpo_trials: 2 +n_epochs_per_trial: 3 +batch_size_per_trial: 128 +validation_split_per_trial: 0.1 +verbosity_per_trial: 0 +hpo_objective_fn: "val_loss" +hpo_objective_direction: "minimize" +hpo_result_folder: "results_hpo_keras_cnn_ae_v0" +hpo_study_name: "study_keras_cnn_ae" +hpo_param_importance: ['latent_dim','n_dense_layers','n_conv_layers','learning_rate','conv_activation','dense_activation'] +# Training of 'final' model: +n_epochs: 5 +batch_size: 128 +validation_split: 0.1 +verbosity: 'auto' +# Tuneabale parameters: +# Conv. architecture: +max_n_conv_layers: 2 +step_n_conv_layers: 1 +max_conv_filters: 50 +min_conv_filters: 20 +step_conv_filters: 10 +# Dense architecture: +max_n_dense_layers: 3 +step_n_dense_layers: 1 +max_dense_units: 30 +min_dense_units: 10 +step_dense_units: 5 +# Latent dim: +max_latent_dim: 50 +min_latent_dim: 10 +step_latent_dim: 5 +# Learning rate: +max_learning_rate: 0.001 +min_learning_rate: 0.000001 +# Activation functions: +conv_activation: ['relu','leaky_relu','tanh'] +dense_activation: ['relu','leaky_relu','tanh'] diff --git a/jlab_datascience_toolkit/cfgs/defaults/image_to_numpy_parser_cfg.yaml b/jlab_datascience_toolkit/cfgs/defaults/image_to_numpy_parser_cfg.yaml new file mode 100644 index 0000000..a36686d --- /dev/null +++ b/jlab_datascience_toolkit/cfgs/defaults/image_to_numpy_parser_cfg.yaml @@ -0,0 +1,5 @@ +image_loc: "" +dtype: "float32" +convert_image_mode: "RGB" +data_store_loc: "" +event_axis: 0 \ No newline at end of file diff --git a/jlab_datascience_toolkit/cfgs/defaults/keras_cnn_ae_cfg.yaml b/jlab_datascience_toolkit/cfgs/defaults/keras_cnn_ae_cfg.yaml new file mode 100644 index 0000000..9f06cda --- /dev/null +++ b/jlab_datascience_toolkit/cfgs/defaults/keras_cnn_ae_cfg.yaml @@ -0,0 +1,36 @@ +precision: "float32" +# Model storage / writing to file: +store_model_loc: null +load_model_loc: null +model_store_format: ".h5" +compile_loaded_model: False +# Model architecture: +image_dimensions: null +max_pooling: null +conv_architecture: [64,32] +conv_activations: ["relu","relu"] +conv_kernel_inits: ["he_normal","he_normal"] +conv_bias_inits: ["zeros","zeros"] +kernel_sizes: [3,3] +strides: [2,2] +dense_architecture: [] +dense_activations: [] +dense_kernel_inits: [] +dense_bias_inits: [] +latent_dim: 10 +latent_space_is_2d: False +decoder_conv_reshape_units: 32 +output_activation: "linear" +output_kernel_size: 3 +output_strides: 1 +# Optimizer and loss function: +learning_rate: 0.0001 +optimizer: "adam" +loss_function: "mse" +# Training the model: +n_epochs: 5 +batch_size: 32 +validation_split: 0.1 +verbosity: "auto" + + diff --git a/jlab_datascience_toolkit/cfgs/defaults/learning_curve_visualizer_cfg.yaml b/jlab_datascience_toolkit/cfgs/defaults/learning_curve_visualizer_cfg.yaml new file mode 100644 index 0000000..6a54ecd --- /dev/null +++ b/jlab_datascience_toolkit/cfgs/defaults/learning_curve_visualizer_cfg.yaml @@ -0,0 +1,9 @@ +plots: {'plot1':['loss','val_loss']} +fig_size: [12,8] +line_width: 3.0 +font_size: 20.0 +leg_font_size: 15.0 +plot_legends: {'plot1':['Training','Validation']} +plot_labels: {'plot1':['Epochs','Loss']} +plot_names: {'plot1':losses} +output_loc: 'results' diff --git a/jlab_datascience_toolkit/cfgs/defaults/mnist_data_parser_cfg.yaml b/jlab_datascience_toolkit/cfgs/defaults/mnist_data_parser_cfg.yaml new file mode 100644 index 0000000..05fa257 --- /dev/null +++ 
b/jlab_datascience_toolkit/cfgs/defaults/mnist_data_parser_cfg.yaml @@ -0,0 +1,3 @@ +train_data_percentage: 0.5 +validation_data_percentage: 0.5 +use_labels: False \ No newline at end of file diff --git a/jlab_datascience_toolkit/cfgs/defaults/numpy_linear_scaler_cfg.yaml b/jlab_datascience_toolkit/cfgs/defaults/numpy_linear_scaler_cfg.yaml new file mode 100644 index 0000000..2e3db37 --- /dev/null +++ b/jlab_datascience_toolkit/cfgs/defaults/numpy_linear_scaler_cfg.yaml @@ -0,0 +1,7 @@ +'A': 1.0 +'B': 0.0 +data_store_loc: "" +store_loc: "" +run_dtype: "float32" +reverse_dtype: "float32" +exclude_data: [] \ No newline at end of file diff --git a/jlab_datascience_toolkit/cfgs/defaults/residual_analyzer_cfg.yaml b/jlab_datascience_toolkit/cfgs/defaults/residual_analyzer_cfg.yaml new file mode 100644 index 0000000..c1dad9e --- /dev/null +++ b/jlab_datascience_toolkit/cfgs/defaults/residual_analyzer_cfg.yaml @@ -0,0 +1,8 @@ +output_loc: "residual_analysis_results" +reduction_mode: "mean" +reduction_axis: 3 +imageplot_figsize: [18,8] +imageplot_noaxes: True +movie_duration: 1.0 +real_data_name: 'x_real' +rec_data_name: 'x_rec' diff --git a/jlab_datascience_toolkit/core/jdst_model.py b/jlab_datascience_toolkit/core/jdst_model.py index c2dfb87..4d2e934 100644 --- a/jlab_datascience_toolkit/core/jdst_model.py +++ b/jlab_datascience_toolkit/core/jdst_model.py @@ -16,8 +16,8 @@ def train(self): def predict(self): raise NotImplementedError - # Run a small analysis (e.g. determine ROC-Curve, MSE,...) + # Get the model (tf- / pytorch object with layers, weights, activations, etc.) @abstractmethod - def analysis(self): - raise NotImplementedError + def get_model(self): + return NotImplementedError \ No newline at end of file diff --git a/jlab_datascience_toolkit/core/jdst_module.py b/jlab_datascience_toolkit/core/jdst_module.py index 97eee29..a581f43 100644 --- a/jlab_datascience_toolkit/core/jdst_module.py +++ b/jlab_datascience_toolkit/core/jdst_module.py @@ -17,6 +17,13 @@ def __init__(self,**kwargs): def get_info(self): raise NotImplementedError + # Request that every module runs a type check on the input data + # This helps to ensure that we can faster identify if certain modules can not be combined + # e.g. 
a pytorch module with a tensorflow module, or modules that simply expect different input data types + @abstractmethod + def check_input_data_type(self): + return NotImplementedError + # Load and save configuration files which run the module: @abstractmethod def load_config(self): diff --git a/jlab_datascience_toolkit/data_parser/__init__.py b/jlab_datascience_toolkit/data_parser/__init__.py deleted file mode 100644 index 44d6f0d..0000000 --- a/jlab_datascience_toolkit/data_parser/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -from jlab_datascience_toolkit.utils.registration import register, make, list_registered_modules - -register( - id="NumpyParser_v0", - entry_point="jlab_datascience_toolkit.data_parser.numpy_parser:NumpyParser" -) - -from jlab_datascience_toolkit.data_parser.numpy_parser import NumpyParser - -register( - id="PandasParser_v0", - entry_point="jlab_datascience_toolkit.data_parser.pandas_parser_v0:PandasParser" -) \ No newline at end of file diff --git a/jlab_datascience_toolkit/data_parsers/__init__.py b/jlab_datascience_toolkit/data_parsers/__init__.py new file mode 100644 index 0000000..4610b87 --- /dev/null +++ b/jlab_datascience_toolkit/data_parsers/__init__.py @@ -0,0 +1,25 @@ +from jlab_datascience_toolkit.utils.registration import register, make, list_registered_modules + +# Numpy Parser: +register( + id="NumpyParser_v0", + entry_point="jlab_datascience_toolkit.data_parsers.numpy_parser:NumpyParser" +) + +# Pandas Parser: +register( + id="PandasParser_v0", + entry_point="jlab_datascience_toolkit.data_parsers.pandas_parser_v0:PandasParser" +) + +# Image to Numpy parser: +register( + id="ImageToNumpyParser_v0", + entry_point="jlab_datascience_toolkit.data_parsers.image_to_numpy_parser:ImageToNumpyParser" +) + +# MNIST Data parser: +register( + id="MNISTDataParser_v0", + entry_point="jlab_datascience_toolkit.data_parsers.mnist_data_parser:MNISTDataParser" +) diff --git a/jlab_datascience_toolkit/data_parsers/image_to_numpy_parser.py b/jlab_datascience_toolkit/data_parsers/image_to_numpy_parser.py new file mode 100644 index 0000000..2dfff6c --- /dev/null +++ b/jlab_datascience_toolkit/data_parsers/image_to_numpy_parser.py @@ -0,0 +1,143 @@ +from jlab_datascience_toolkit.core.jdst_data_parser import JDSTDataParser +from PIL import Image +import os +import numpy as np +import logging +import inspect +import yaml + +class ImageToNumpyParser(JDSTDataParser): + + """Image to Numpy data parser that reads in strings of file paths to images and returns a single .npy file + + What this module does: + "i) Read in multiple .png files that are specified in a list of strings + ii) Combine single .npy files into one + + Input(s): + i) Full path to .yaml configuration file + ii) Optional: User configuration, i.e. a python dict with additonal / alternative settings + + Output(s): + i) Single .npy file + """ + + # Initialize: + #********************************************* + def __init__(self,path_to_cfg,user_config={}): + # Set the name specific to this module: + self.module_name = "image_to_numpy_parser" + + # Load the configuration: + self.config = self.load_config(path_to_cfg,user_config) + + # Save this config, if a path is provided: + if 'store_cfg_loc' in self.config: + self.save_config(self.config['store_cfg_loc']) + + # Run sanity check(s): + # i) Make sure that the provide data path(s) are list objects: + if isinstance(self.config['image_loc'],list) == False: + logging.error(">>> " + self.module_name +": The data path(s) must be a list object, e.g. data_loc: [path1,path2,...] 
<<<") + #********************************************* + + # Check the input data type --> This module expects a list of strings / file paths: + #********************************************* + def check_input_data_type(self,input_data): + if isinstance(input_data,list) == False: + logging.error(f">>> {self.module_name}: The input data type {type(input_data)} is not a list. Please correct. Going to returne None <<<") + return False + else: + if len(input_data) > 0: + return True + else: + logging.error(f">>> {self.module_name}: The list of filepaths your provided {input_data} seems to be empty. Please check your configuration. Going to return None <<<") + return False + #********************************************* + + # Provide information about this module: + #********************************************* + def get_info(self): + print(inspect.getdoc(self)) + #********************************************* + + # Handle configurations: + #********************************************* + # Load the config: + def load_config(self,path_to_cfg,user_config): + with open(path_to_cfg, 'r') as file: + cfg = yaml.safe_load(file) + + # Overwrite config with user settings, if provided + try: + if bool(user_config): + #++++++++++++++++++++++++ + for key in user_config: + cfg[key] = user_config[key] + #++++++++++++++++++++++++ + except: + logging.exception(">>> " + self.module_name +": Invalid user config. Please make sure that a dictionary is provided <<<") + + return cfg + + #----------------------------- + + # Store the config: + def save_config(self,path_to_config): + with open(path_to_config, 'w') as file: + yaml.dump(self.config, file) + #********************************************* + + # Load images: + #********************************************* + # Load a single image: + def load_single_image(self,path): + try: + img = Image.open(path) + if self.config['convert_image_mode'] is not None: + img = img.convert(self.config['convert_image_mode']) + + data = np.array(img).astype(self.config['dtype']) + return data + except: + logging.exception(f">>> " + self.module_name + ": File {path} does not exist <<<") + + #----------------------------- + + # Now load multiple files: + def load_data(self): + + if self.check_input_data_type(self.config['image_loc']) == True: + + collected_data = [] + #+++++++++++++++++++++ + for path in self.config['image_loc']: + collected_data.append(np.expand_dims(self.load_single_image(path),axis=self.config['event_axis'])) + #+++++++++++++++++++++ + + return np.concatenate(collected_data,axis=self.config['event_axis']) + + return None + #********************************************* + + # Save the data: + #********************************************* + def save_data(self,data): + try: + os.makedirs(self.config['data_store_loc'],exist_ok=True) + np.save(self.config['data_store_loc'],data) + except: + logging.exception(">>> " + self.module_name + ": Please provide a valid name for storing the data in .npy format. 
<<<") + #********************************************* + + # Module checkpointing: Not implemented yet and maybe not + # necessary, as we leave these functions blank for now + #********************************************* + def load(self): + return 0 + + #----------------------------- + + def save(self): + return 0 + #********************************************* \ No newline at end of file diff --git a/jlab_datascience_toolkit/data_parsers/mnist_data_parser.py b/jlab_datascience_toolkit/data_parsers/mnist_data_parser.py new file mode 100644 index 0000000..c0bcacf --- /dev/null +++ b/jlab_datascience_toolkit/data_parsers/mnist_data_parser.py @@ -0,0 +1,128 @@ +from jlab_datascience_toolkit.core.jdst_data_parser import JDSTDataParser +from jlab_datascience_toolkit.utils.get_mnist import get_mnist_data +from PIL import Image +import os +import numpy as np +import logging +import inspect +import yaml +from sklearn.utils import shuffle + +class MNISTDataParser(JDSTDataParser): + ''' + Dummy data parser that does not require any specific inputs (e.g. paths or data files) and simply returns the MNIST data (without labels!) + ''' + + # Initialize: + #********************************************* + def __init__(self,path_to_cfg,user_config={}): + # Set the name specific to this module: + self.module_name = "image_to_numpy_parser" + + # Load the configuration: + self.config = self.load_config(path_to_cfg,user_config) + + # Save this config, if a path is provided: + if 'store_cfg_loc' in self.config: + self.save_config(self.config['store_cfg_loc']) + + self.train_data_percentage = self.config['train_data_percentage'] + self.validation_data_percentage = self.config['validation_data_percentage'] + self.use_labels = self.config['use_labels'] + + # Keep track of the labels: + self.mnist_labels = None + #********************************************* + + # Check input data which is not necessary here, as this module does not require any: + #********************************************* + def check_input_data_type(self): + pass + #********************************************* + + # Provide information about this module: + #********************************************* + def get_info(self): + print(inspect.getdoc(self)) + #********************************************* + + # Handle configurations: + #********************************************* + # Load the config: + def load_config(self,path_to_cfg,user_config): + with open(path_to_cfg, 'r') as file: + cfg = yaml.safe_load(file) + + # Overwrite config with user settings, if provided + try: + if bool(user_config): + #++++++++++++++++++++++++ + for key in user_config: + cfg[key] = user_config[key] + #++++++++++++++++++++++++ + except: + logging.exception(">>> " + self.module_name +": Invalid user config. 
Please make sure that a dictionary is provided <<<") + + return cfg + + #----------------------------- + + # Store the config: + def save_config(self,path_to_config): + with open(path_to_config, 'w') as file: + yaml.dump(self.config, file) + #********************************************* + + # Now load multiple files: + def load_data(self): + # Get the mnist data + x_train,y_train,x_val,y_val = get_mnist_data() + + # Select specific portions: + n_train = int(self.train_data_percentage*x_train.shape[0]) + n_val = int(self.validation_data_percentage*x_val.shape[0]) + + idx_train = np.random.choice(x_train.shape[0],n_train) + idx_val = np.random.choice(x_val.shape[0],n_val) + + # And combine them to new data: + new_data = np.concatenate([ + x_train[idx_train], + x_val[idx_val] + ],axis=0) + + # Use labels if requested by the user: + if self.use_labels == True: + logging.info(">>> " + self.module_name + ": Using MNIST labels as well <<<") + new_labels = np.concatenate([ + y_train[idx_train], + y_val[idx_val] + ],axis=0) + + new_data, self.mnist_labels = shuffle(new_data,new_labels) + return new_data + + return shuffle(new_data) + #********************************************* + + # Save the data: + #********************************************* + def save_data(self,data): + try: + os.makedirs(self.config['data_store_loc'],exist_ok=True) + np.save(self.config['data_store_loc'],data) + except: + logging.exception(">>> " + self.module_name + ": Please provide a valid name for storing the data in .npy format. <<<") + #********************************************* + + # Module checkpointing: Not implemented yet and maybe not + # necessary, as we leave these functions blank for now + #********************************************* + def load(self): + return 0 + + #----------------------------- + + def save(self): + return 0 + #********************************************* \ No newline at end of file diff --git a/jlab_datascience_toolkit/data_parser/numpy_parser.py b/jlab_datascience_toolkit/data_parsers/numpy_parser.py similarity index 81% rename from jlab_datascience_toolkit/data_parser/numpy_parser.py rename to jlab_datascience_toolkit/data_parsers/numpy_parser.py index 9a0f718..bd930d8 100644 --- a/jlab_datascience_toolkit/data_parser/numpy_parser.py +++ b/jlab_datascience_toolkit/data_parsers/numpy_parser.py @@ -3,6 +3,7 @@ import yaml import logging import inspect +import os class NumpyParser(JDSTDataParser): """Numpy data parser that reads in strings of file paths and returns a single .npy file @@ -38,6 +39,20 @@ def __init__(self,path_to_cfg,user_config={}): logging.error(">>> " + self.module_name +": The data path(s) must be a list object, e.g. data_loc: [path1,path2,...] <<<") #********************************************* + # Check the input data type --> This module expects a list of strings / file paths: + #********************************************* + def check_input_data_type(self,input_data): + if isinstance(input_data,list) == False: + logging.error(f">>> {self.module_name}: The input data type {type(input_data)} is not a list. Please correct. Going to returne None <<<") + return False + else: + if len(input_data) > 0: + return True + else: + logging.error(f">>> {self.module_name}: The list of filepaths your provided {input_data} seems to be empty. Please check your configuration. 
Going to return None <<<") + return False + #********************************************* + # Provide information about this module: #********************************************* def get_info(self): @@ -84,7 +99,9 @@ def load_single_file(self,path_to_file): # Load multiple files which represent the final data: def load_data(self): - try: + + if self.check_input_data_type(self.config['data_loc']) == True: + collected_data = [] #+++++++++++++++++++++ for path in self.config['data_loc']: @@ -92,8 +109,8 @@ def load_data(self): #+++++++++++++++++++++ return np.concatenate(collected_data,axis=self.config['event_axis']) - except: - logging.exception(">>> " + self.module_name + ": Please check the provided data path which must be a list. <<<") + + return None #********************************************* # Save the data: @@ -107,7 +124,7 @@ def save_data(self,data): #********************************************* # Module checkpointing: Not implemented yet and maybe not - # necessary, ao we leave these functions blank for now + # necessary, as we leave these functions blank for now #********************************************* def load(self): return 0 diff --git a/jlab_datascience_toolkit/data_parser/pandas_parser_v0.py b/jlab_datascience_toolkit/data_parsers/pandas_parser_v0.py similarity index 80% rename from jlab_datascience_toolkit/data_parser/pandas_parser_v0.py rename to jlab_datascience_toolkit/data_parsers/pandas_parser_v0.py index a90976a..4e703f2 100644 --- a/jlab_datascience_toolkit/data_parser/pandas_parser_v0.py +++ b/jlab_datascience_toolkit/data_parsers/pandas_parser_v0.py @@ -63,6 +63,8 @@ def __init__(self, config: dict = None): # It is important not to use default mutable arguments in python # (lists/dictionaries), so we set config to None and update later + self.module_name = "pandas_parser" + # Set default config self.config = dict( filepaths=[], @@ -94,6 +96,17 @@ def setup(self): f'File format {self.config["file_format"]}' 'is not currently supported.') raise ValueError + + def check_input_data_type(self,input_data): + if isinstance(input_data,list) == False: + logging.error(f">>> {self.name}: The input data type {type(input_data)} is not a list. Please correct. Going to returne None <<<") + return False + else: + if len(input_data) > 0: + return True + else: + logging.error(f">>> {self.name}: The list of filepaths your provided {input_data} seems to be empty. Please check your configuration. Going to return None <<<") + return False def get_info(self): """ Prints the docstring for the PandasParser module""" @@ -131,27 +144,23 @@ def load_data(self) -> pd.DataFrame: Returns: pd.DataFrame: A single DataFrame containing concatenated data """ - data_list = [] - for file in self.config['filepaths']: - pandas_parser_log.debug(f'Loading {file} ...') - data = self.read_function( - file, - **self.config['read_kwargs']) - data_list.append(data) - - # Check for empty data and return nothing if empty - if not data_list: - pandas_parser_log.warning( - 'load_data() returning None. This is probably not what you ' - 'wanted. 
Ensure that your configuration includes the key ' - '"filepaths"') - return - - output = pd.concat( + + if self.check_input_data_type(self.config['filepaths']) == True: + + data_list = [] + for file in self.config['filepaths']: + pandas_parser_log.debug(f'Loading {file} ...') + data = self.read_function( + file, + **self.config['read_kwargs']) + data_list.append(data) + + return pd.concat( data_list, **self.config['concat_kwargs']) - return output + + return None def load_config(self, path: str): pandas_parser_log.debug('Calling load()...') diff --git a/jlab_datascience_toolkit/data_prep/__init__.py b/jlab_datascience_toolkit/data_prep/__init__.py deleted file mode 100644 index 0f8b07c..0000000 --- a/jlab_datascience_toolkit/data_prep/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from jlab_datascience_toolkit.utils.registration import register, make, list_registered_modules - -register( - id="NumpyMinMaxScaler_v0", - entry_point="jlab_datascience_toolkit.data_prep.numpy_minmax_scaler:NumpyMinMaxScaler" -) - -from jlab_datascience_toolkit.data_prep.numpy_minmax_scaler import NumpyMinMaxScaler - diff --git a/jlab_datascience_toolkit/data_preps/__init__.py b/jlab_datascience_toolkit/data_preps/__init__.py new file mode 100644 index 0000000..cc6811e --- /dev/null +++ b/jlab_datascience_toolkit/data_preps/__init__.py @@ -0,0 +1,13 @@ +from jlab_datascience_toolkit.utils.registration import register, make, list_registered_modules + +# Min Max Scaler: +register( + id="NumpyMinMaxScaler_v0", + entry_point="jlab_datascience_toolkit.data_preps.numpy_minmax_scaler:NumpyMinMaxScaler" +) + +# Numpy Linear Scaler: +register( + id="NumpyLinearScaler_v0", + entry_point="jlab_datascience_toolkit.data_preps.numpy_linear_scaler:NumpyLinearScaler" +) diff --git a/jlab_datascience_toolkit/data_preps/numpy_linear_scaler.py b/jlab_datascience_toolkit/data_preps/numpy_linear_scaler.py new file mode 100644 index 0000000..8adac64 --- /dev/null +++ b/jlab_datascience_toolkit/data_preps/numpy_linear_scaler.py @@ -0,0 +1,169 @@ +from jlab_datascience_toolkit.core.jdst_data_prep import JDSTDataPrep +import numpy as np +import yaml +import inspect +import logging +import os + +class NumpyLinearScaler(JDSTDataPrep): + """Simplified linear scaler + + What this module does: + "i) Apply the transformation: A * X + B, where A, B are constants and X is either a numpy array / image or a dictionary containing numpy arrays / images + + Input(s): + i) Scale A + ii) Offset B + iii) dtype (int,float,etc.) for the scaled data + iv) dtype (int,float,etc.) 
for the reverse scaled data + + Output(s): + i) Scaled image or dict with scaled images + """ + + # Initialize: + #********************************************* + def __init__(self,path_to_cfg,user_config={}): + # Set the name specific to this module: + self.module_name = "numpy_linear_scaler" + + # Load the configuration: + self.config = self.load_config(path_to_cfg,user_config) + + # Check for data that shall be excluded from this module: + # Note: This only works if the input data is a dictionary + self.exclude_data = self.config['exclude_data'] + + # Save this config, if a path is provided: + if 'store_cfg_loc' in self.config: + self.save_config(self.config['store_cfg_loc']) + #********************************************* + + # Run type check on the input data: + #********************************************* + def check_input_data_type(self,data): + if isinstance(data,np.ndarray) == True: + return "numpy" + + elif isinstance(data,dict) == True: + # Make sure that every element in the dictionary is a numpy array: + pass_type_check = True + #+++++++++++++++++ + for key in data: + if isinstance(data[key],np.ndarray) == False and key not in self.exclude_data: + pass_type_check = False + #+++++++++++++++++ + + if pass_type_check: + return "dict" + + logging.error(">>> " + self.module_name + ": Dictionary does not contain numpy data<<<") + return "no_implemented" + else: + logging.error(f">>> {self.module_name}: Data type {type(data)} is neither a numpy array nor dictionary with numpy data<<<") + return "no_implemented" + #********************************************* + + # Provide information about this module: + #********************************************* + def get_info(self): + print(inspect.getdoc(self)) + #********************************************* + + # Handle configurations: + #********************************************* + # Load the config: + def load_config(self,path_to_cfg,user_config): + with open(path_to_cfg, 'r') as file: + cfg = yaml.safe_load(file) + + # Overwrite config with user settings, if provided + try: + if bool(user_config): + #++++++++++++++++++++++++ + for key in user_config: + cfg[key] = user_config[key] + #++++++++++++++++++++++++ + except: + logging.exception(">>> " + self.module_name +": Invalid user config. 
Please make sure that a dictionary is provided <<<") + + return cfg + + #----------------------------- + + # Store the config: + def save_config(self,path_to_config): + with open(path_to_config, 'w') as file: + yaml.dump(self.config, file) + #********************************************* + + + + # Run and reverse the scaling: + #********************************************* + # Scale: + def run(self,data): + if self.check_input_data_type(data).lower() == "numpy": + return (data * self.config['A'] + self.config['B']).astype(self.config['run_dtype']) + elif self.check_input_data_type(data).lower() == "dict": + result_dict = {} + #+++++++++++++++++++ + for key in data: + if key not in self.exclude_data: + result_dict[key] = (data[key] * self.config['A'] + self.config['B']).astype(self.config['run_dtype']) + #+++++++++++++++++++ + + return result_dict + + return None + #----------------------------- + + # Undo scaling: + def reverse(self,data): + if self.check_input_data_type(data).lower() == "numpy": + reversed_data = (data - self.config['B']) / self.config['A'] + return reversed_data.astype(self.config['reverse_dtype']) + elif self.check_input_data_type(data).lower() == "dict": + result_dict = {} + #+++++++++++++++++++ + for key in data: + if key not in self.exclude_data: + reversed_data = (data[key] - self.config['B']) / self.config['A'] + result_dict[key] = reversed_data.astype(self.config['reverse_dtype']) + #+++++++++++++++++++ + + return result_dict + + return None + #********************************************* + + # Save the data: + #********************************************* + def save_data(self,data): + try: + os.makedirs(self.config['data_store_loc'],exist_ok=True) + np.save(self.config['data_store_loc'],data) + except: + logging.exception(">>> " + self.module_name + ": Please provide a valid name for storing the transformed .npy data <<<") + #********************************************* + + # Module checkpointing: Save and load parameters that are important to this scaler: + #********************************************* + def load(self): + store_name = self.config['store_loc'] + A = np.load(store_name+"/numpy_linear_scaler_A.npy") + B = np.load(store_name+"/numpy_linear_scaler_B.npy") + return { + 'A':A, + 'B':B + } + + #----------------------------- + + def save(self): + store_name = self.config['store_loc'] + os.makedirs(store_name,exist_ok=True) + + np.save(store_name+"/numpy_linear_scaler_A.npy",self.config['A']) + np.save(store_name+"/numpy_linear_scaler_B.npy",self.config['B']) + #********************************************* diff --git a/jlab_datascience_toolkit/data_prep/numpy_minmax_scaler.py b/jlab_datascience_toolkit/data_preps/numpy_minmax_scaler.py similarity index 80% rename from jlab_datascience_toolkit/data_prep/numpy_minmax_scaler.py rename to jlab_datascience_toolkit/data_preps/numpy_minmax_scaler.py index f2995d9..f3f0a18 100644 --- a/jlab_datascience_toolkit/data_prep/numpy_minmax_scaler.py +++ b/jlab_datascience_toolkit/data_preps/numpy_minmax_scaler.py @@ -3,6 +3,7 @@ from sklearn.preprocessing import MinMaxScaler import logging import yaml +import inspect import os class NumpyMinMaxScaler(JDSTDataPrep): @@ -27,25 +28,19 @@ def __init__(self,path_to_cfg,user_config={}): logging.exception(">>> " + self.module_name + f": Invalid feature range: {self.config['feature_range']}. Must provide a tuple. 
<<<") #********************************************* + # Run type check on the input data: + #********************************************* + def check_input_data_type(self,data): + if isinstance(data,np.ndarray) == False: + logging.error(f">>> {self.module_name}: Provided data type {type(data)} is not a numpy array. Please check your workflow. Going to return None <<<") + return False + else: + return True + # Provide information about this module: #********************************************* def get_info(self): - print(" ") - print("*** Info: NumpyMinMaxScaler ***") - print("Input(s):") - print("i) Full path to .yaml configuration file ") - print("ii) Optional: User configuration, i.e. a python dict with additonal / alternative settings") - print("iii) Numpy data") - print("What this module does:") - print("i) Scale input data with respect to a specified range") - print("ii) Optional: reverse the scaling") - print("Output(s):") - print("i) Scaled .npy data") - print("ii) Optional: unscaled .npy data") - print("Note(s):") - print("i) The scaler will (by default) be fitted to the data and the transform it. To disable the fitting, do: run(data,disable_fit=True)") - print("*** Info: NumpyMinMaxScaler ***") - print(" ") + print(inspect.getdoc(self)) #********************************************* # Handle configurations: @@ -74,25 +69,14 @@ def save_config(self,path_to_config): with open(path_to_config, 'w') as file: yaml.dump(self.config, file) #********************************************* - - # Run a type chec: - #********************************************* - def type_check(self,data): - if isinstance(data,np.ndarray) == False: - logging.error(">>> " + self.module_name + ": Data is not a numpy array <<<") - return False - - return True - #********************************************* - - + # Run and reverse the scaling: #********************************************* # Scale: def run(self,data,disable_fit=False): # Check if the data-type is a numpy array: - if self.type_check(data): + if self.check_input_data_type(data): # Do not re-calibrate the scaler, if a fit has already been done: if disable_fit == True: @@ -100,13 +84,17 @@ def run(self,data,disable_fit=False): return self.scaler.fit_transform(data) + return None + #----------------------------- # Undo the scaling: def reverse(self,data): # Run a type check: - if self.type_check(data): + if self.check_input_data_type(data): return self.scaler.inverse_transform(data) + + return None #********************************************* # Save the data: diff --git a/jlab_datascience_toolkit/driver/image_anomaly_detection.py b/jlab_datascience_toolkit/driver/image_anomaly_detection.py new file mode 100644 index 0000000..3161145 --- /dev/null +++ b/jlab_datascience_toolkit/driver/image_anomaly_detection.py @@ -0,0 +1,67 @@ +import os +from jlab_datascience_toolkit.utils.graph_driver_utils import GraphRuntime +import numpy as np + +modules = { + 'data_parser':'MNISTDataParser_v0', + 'data_scaler':'NumpyLinearScaler_v0', + 'anomaly_detector':'KerasCNNAE_v0', + 'loss_visualizer':'LearningCurveVisualizer_v0', + 'data_reconstruction':'DataReconstruction_v0', + 'residual_analysis':'ResidualAnalyzer_v0' +} + +graph = [ + (None,'data_parser.load_data','mnist_data'), + ('mnist_data','data_scaler.run','scaled_data'), + ('scaled_data','anomaly_detector.train','training_history'), + (None,'anomaly_detector.get_model','model'), + ('training_history','loss_visualizer.run',None), + (('scaled_data','model'),'data_reconstruction.run','rec_data'), + 
('rec_data','data_scaler.reverse','unscaled_data'), + ('unscaled_data','residual_analysis.run','residuals') +] + +this_file_loc = os.path.dirname(__file__) +cfg_locs = { + 'data_parser':os.path.join(this_file_loc,'../cfgs/defaults/mnist_data_parser_cfg.yaml'), + 'data_scaler':os.path.join(this_file_loc,'../cfgs/defaults/numpy_linear_scaler_cfg.yaml'), + 'anomaly_detector':os.path.join(this_file_loc,'../cfgs/defaults/keras_cnn_ae_cfg.yaml'), + 'loss_visualizer':os.path.join(this_file_loc,'../cfgs/defaults/learning_curve_visualizer_cfg.yaml'), + 'data_reconstruction':os.path.join(this_file_loc,'../cfgs/defaults/data_reconstruction_cfg.yaml'), + 'residual_analysis':os.path.join(this_file_loc,'../cfgs/defaults/residual_analyzer_cfg.yaml') +} + +# Set the result location: +result_loc = 'anomaly_analysis_results_v0' + +user_cfgs = { + 'data_parser':{}, + 'data_scaler':{'A':1.0/255.0}, + 'anomaly_detector':{ + 'store_model_loc':result_loc, + 'image_dimensions':(28,28,1), + 'n_epochs': 50, + 'dense_architecture':[10,10], + 'dense_activations':['relu']*2, + 'dense_kernel_inits':['he_normal']*2, + 'dense_bias_inits':['he_normal']*2, + 'latent_space_is_2d':False, + 'optimizer':'legacy_adam', + 'early_stopping_monitor':'val_loss', + 'early_stopping_min_delta':0.00005, + 'early_stopping_patience':5, + 'early_stopping_restore_best_weights':True, + }, + 'loss_visualizer':{ + 'output_loc':result_loc + }, + 'data_reconstruction':{}, + 'residual_analysis':{ + 'output_loc':result_loc, + 'real_data_name': 'x_orig' + } +} + +gr = GraphRuntime() +results, module_dict = gr.run_graph(graph, modules,cfg_locs,user_cfgs) diff --git a/jlab_datascience_toolkit/hyper_parameter_tuning/hpo_keras_cnn_ae.py b/jlab_datascience_toolkit/hyper_parameter_tuning/hpo_keras_cnn_ae.py new file mode 100644 index 0000000..5853b89 --- /dev/null +++ b/jlab_datascience_toolkit/hyper_parameter_tuning/hpo_keras_cnn_ae.py @@ -0,0 +1,393 @@ +from jlab_datascience_toolkit.core.jdst_model import JDSTModel +import jlab_datascience_toolkit.models as models +import tensorflow as tf +import numpy as np +import matplotlib.pyplot as plt +import inspect +import optuna +import yaml +import logging +import os +import gc + +class HPOKerasCNNAE(JDSTModel): + ''' + Hyper parameter optimization class for a keras CNN AE. 
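+
+    Optuna is used to sample the tuneable parameters defined in the configuration
+    (number of conv / dense layers, filter and unit counts, latent dimension,
+    learning rate and activation functions), train a candidate KerasCNNAE for
+    n_epochs_per_trial epochs and rank the trials by hpo_objective_fn (e.g. val_loss,
+    minimized or maximized according to hpo_objective_direction). As an illustration
+    of the sampling pattern used in probe_hp_space (the latent-dimension call below
+    is a sketch, not a verbatim quote of this class):
+
+        latent_dim = trial.suggest_int("latent_dim", self.min_latent_dim,
+                                       self.max_latent_dim, step=self.step_latent_dim)
+
+    Input(s):
+    i) Numpy arrays / images used to train and score the trial models
+
+    Output(s):
+    i) HPO study results stored in hpo_result_folder
+    ii) A 'final' model trained with the best configuration found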
+ ''' + + # Initialize: + #**************************** + def __init__(self,path_to_cfg,user_config={}): + # Set the name specific to this module: + self.module_name = "keras_cnn_ae" + + # Load the configuration: + self.config = self.load_config(path_to_cfg,user_config) + + # Basic settings: + + # ID of the CNN AE model we wish to tune: + self.model_id = self.config['model_id'] + # Get the model configuration: + self.model_cfg_loc = self.config['model_cfg_loc'] + # If there is no location for a model configuration specified, be just go back and load the default one: + if self.model_cfg_loc == "" or self.model_cfg_loc is None: + this_file_loc = os.path.dirname(__file__) + self.model_cfg_loc = os.path.join(this_file_loc,'../cfgs/defaults/keras_cnn_ae_cfg.yaml') + + # Load the config: + with open(self.model_cfg_loc, 'r') as cfg: + self.model_config = yaml.safe_load(cfg) + + # Get the iamge dimensions right: + self.model_config['image_dimensions'] = self.config['image_dimensions'] + # Add max pooling: + self.model_config['max_pooling'] = self.config['max_pooling'] + # Optimizer: + self.model_config['optimizer'] = self.config['optimizer'] + + + # Kernel size and strides: + self.kernel_size = self.config['kernel_size'] + self.stride = self.config['stride'] + + # HPO specific settings: + self.n_hpo_trials = self.config['n_hpo_trials'] + self.n_epochs_per_trial = self.config['n_epochs_per_trial'] + self.batch_size_per_trial = self.config['batch_size_per_trial'] + self.validation_split_per_trial = self.config['validation_split_per_trial'] + self.verbosity_per_trial = self.config['verbosity_per_trial'] + self.hpo_objective_fn = self.config['hpo_objective_fn'] + self.hpo_objective_direction = self.config['hpo_objective_direction'] + self.hpo_result_folder = self.config['hpo_result_folder'] + self.hpo_study_name = self.config['hpo_study_name'] + self.hpo_param_importance = self.config['hpo_param_importance'] + + # Training of the final model: + self.n_epochs = self.config['n_epochs'] + self.batch_size = self.config['batch_size'] + self.validation_split = self.config['validation_split'] + self.verbosity = self.config['verbosity'] + + # Weight and bias initialization: + self.conv_kernel_initialization = self.config['conv_kernel_initialization'] + self.conv_bias_initialization = self.config['conv_bias_initialization'] + self.dense_kernel_initialization = self.config['dense_kernel_initialization'] + self.dense_bias_initialization = self.config['dense_bias_initialization'] + + + # Tuneabale parameters: + + # Conv. 
Architecture: + self.max_n_conv_layers = self.config['max_n_conv_layers'] + self.step_n_conv_layers = self.config['step_n_conv_layers'] + self.min_conv_filters = self.config['min_conv_filters'] + self.max_conv_filters = self.config['max_conv_filters'] + self.step_conv_filters = self.config['step_conv_filters'] + # Dense Architecture: + self.max_n_dense_layers = self.config['max_n_dense_layers'] + self.step_n_dense_layers = self.config['step_n_dense_layers'] + self.min_dense_units = self.config['min_dense_units'] + self.max_dense_units = self.config['max_dense_units'] + self.step_dense_units = self.config['step_dense_units'] + # Latent space: + self.min_latent_dim = self.config['min_latent_dim'] + self.max_latent_dim = self.config['max_latent_dim'] + self.step_latent_dim = self.config['step_latent_dim'] + # Learning rate: + self.max_learning_rate = self.config['max_learning_rate'] + self.min_learning_rate = self.config['min_learning_rate'] + # Activation functions: + self.conv_activation = self.config['conv_activation'] + self.dense_activation = self.config['dense_activation'] + + # Set up the optimizer: + # Collect results: + os.makedirs(self.hpo_result_folder,exist_ok=True) + # Write this config to file: + self.save_config(self.hpo_result_folder+"/hpo_configuration.yaml") + + # Preparation for objective scan: + self.score = 1E99 + self.hpo_data = None + self.model = None + self.maximize_objective = False + if self.hpo_objective_direction.lower() == "maximize": + self.score = -1E99 + self.maximize_objective = True + #**************************** + + # Pass on the data type check for now, as tensorflow allows a variety of data types + # see here: https://www.tensorflow.org/api_docs/python/tf/keras/Model#fit + #**************************** + def check_input_data_type(self): + pass + #**************************** + + # Provide information about this module: + #********************************************* + def get_info(self): + print(inspect.getdoc(self)) + #********************************************* + + # Handle configurations: + #********************************************* + # Load the config: + def load_config(self,path_to_cfg,user_config): + with open(path_to_cfg, 'r') as file: + cfg = yaml.safe_load(file) + + # Overwrite config with user settings, if provided + try: + if bool(user_config): + #++++++++++++++++++++++++ + for key in user_config: + cfg[key] = user_config[key] + #++++++++++++++++++++++++ + except: + logging.exception(">>> " + self.module_name +": Invalid user config. 
Please make sure that a dictionary is provided <<<") + + return cfg + + #----------------------------- + + # Store the config: + def save_config(self,path_to_config): + with open(path_to_config, 'w') as file: + yaml.dump(self.config, file) + #********************************************* + + # Probe the hyper parameter space --> We overwrite the original model configuration and then load the new model + #********************************************* + # Adjust weight / bias intitialization: + def initialize_layers(self,activations,kernel_init_str,bias_init_str): + kernel_inits = [] + bias_inits = [] + + if len(activations) > 0: + #++++++++++++++++++++ + for act in activations: + if act.lower() == "relu" or act.lower() == "leaky_relu": + kernel_inits.append("he_"+kernel_init_str) + elif act.lower() == "selu": + kernel_inits.append("lecun_"+kernel_init_str) + else: + kernel_inits.append("glorot_"+kernel_init_str) + + bias_inits.append(bias_init_str) + #++++++++++++++++++++ + + return kernel_inits, bias_inits + + #----------------------------- + + # Set hyper parameters for a give trial: + def probe_hp_space(self,trial): + # Record the trial number: + current_trial = trial.number + + # Copy original model config file, just to be on the safe side: + current_model_cfg = self.model_config.copy() + + # Update hyper parameters + current_model_cfg['trial'] = current_trial + + # Conv. architecture: + if self.step_n_conv_layers > 0 and self.max_n_conv_layers > 1: + n_conv_layers = trial.suggest_int("n_conv_layers",1,self.max_n_conv_layers,step=self.step_n_conv_layers) + + if self.max_conv_filters > 1 and self.min_conv_filters > 0 and self.step_conv_filters > 0: + new_conv_architecture = [] + n_filters_prev = 0 + #+++++++++++++++++++++++++ + for k in range(n_conv_layers): + n_filters = trial.suggest_int(f'n_filters_layer{k}',n_filters_prev+self.min_conv_filters,n_filters_prev+self.max_conv_filters,step=self.step_conv_filters) + new_conv_architecture.append(n_filters) + + n_filters_prev = n_filters + #+++++++++++++++++++++++++ + current_model_cfg['conv_architecture'] = new_conv_architecture + current_model_cfg['kernel_sizes'] = [self.kernel_size] * len(current_model_cfg['conv_architecture']) + current_model_cfg['strides'] = [self.stride] * len(current_model_cfg['conv_architecture']) + + # Dense architecture: + if self.step_n_dense_layers > 0 and self.max_n_dense_layers > 1: + n_dense_layers = trial.suggest_int("n_dense_layers",1,self.max_n_dense_layers,step=self.step_n_dense_layers) + + if self.max_dense_units > 1 and self.min_dense_units > 0 and self.step_dense_units > 0: + new_dense_architecture = [] + n_units_prev = 0 + #+++++++++++++++++++++++++ + for d in range(n_dense_layers): + n_units = trial.suggest_int(f'n_units_layer{d}',n_units_prev+self.min_dense_units,n_units_prev+self.max_dense_units,step=self.step_dense_units) + new_dense_architecture.append(n_units) + + n_units_prev = n_units + #+++++++++++++++++++++++++ + current_model_cfg['dense_architecture'] = new_dense_architecture + # Need to update the number of dense layers: + n_dense_layers = len(current_model_cfg['dense_architecture']) + + + # Activations: + if len(self.conv_activation) > 1: + conv_act = trial.suggest_categorical("conv_activation",self.conv_activation) + current_model_cfg['conv_activations'] = [conv_act] * len(current_model_cfg['conv_architecture']) + elif len(self.conv_activation) == 1: + current_model_cfg['conv_activations'] = self.conv_activation * len(current_model_cfg['conv_architecture']) + + if 
len(self.dense_activation) > 1 and n_dense_layers > 0: + dense_act = trial.suggest_categorical("dense_activation",self.dense_activation) + current_model_cfg['dense_activations'] = [dense_act] * len(current_model_cfg['dense_architecture']) + elif len(self.dense_activation) == 1: + current_model_cfg['dense_activations'] = self.dense_activation * len(current_model_cfg['dense_architecture']) + + # Weight and bias initialization: + conv_kernel_init, conv_bias_init = self.initialize_layers(current_model_cfg['conv_activations'],self.conv_kernel_initialization,self.conv_bias_initialization) + dense_kernel_init, dense_bias_init = self.initialize_layers(current_model_cfg['dense_activations'],self.dense_kernel_initialization,self.dense_bias_initialization) + + current_model_cfg['conv_kernel_inits'] = conv_kernel_init + current_model_cfg['conv_bias_inits'] = conv_bias_init + current_model_cfg['dense_kernel_inits'] = dense_kernel_init + current_model_cfg['dense_bias_inits'] = dense_bias_init + + # Latent space: + if self.min_latent_dim > 0 and self.max_latent_dim > 1 and self.step_latent_dim > 0: + current_model_cfg['latent_dim'] = trial.suggest_int('latent_dim',self.min_latent_dim,self.max_latent_dim,step=self.step_latent_dim) + + # Learning rate: + if self.min_learning_rate > 0.0 and self.max_learning_rate > self.min_learning_rate: + current_model_cfg['learning_rate'] = trial.suggest_float("learning_rate",self.min_learning_rate,self.max_learning_rate,log=True) + + return current_model_cfg + #********************************************* + + # Objective: + #********************************************* + def objective(self,trial): + # Clear the memory: + tf.keras.backend.clear_session() + + # Probe hyper parameters and get model settings: + current_settings = self.probe_hp_space(trial) + + # Create a new model: + current_model = models.make(self.model_id,path_to_cfg=self.model_cfg_loc,user_config=current_settings) + + # Train the model for a bit, to extract the objective: + current_results = current_model.train( + x=self.hpo_data, + n_epochs=self.n_epochs_per_trial, + batch_size=self.batch_size_per_trial, + validation_split=self.validation_split_per_trial, + verbosity=self.verbosity_per_trial + ) + + # Retrieve objective: + objective_score = None + if self.hpo_objective_fn in current_results: + objective_score = current_results[self.hpo_objective_fn][-1] + else: + objective_score = current_results['loss'][-1] + + # Run garbage collection to free memory: + gc.collect() + + # Guard against NaN / infinite objective scores: + if np.isnan(objective_score) or np.isposinf(objective_score): + objective_score = 1E99 + + if np.isneginf(objective_score): + objective_score = -1E99 + + # Compare the current performance to the previous best and keep the better model: + if self.maximize_objective == True: + if objective_score > self.score: + self.score = objective_score + self.model = current_model + else: + if objective_score < self.score: + self.score = objective_score + self.model = current_model + + del current_model + del current_settings + + return objective_score + #********************************************* + + # Visualize HP performance: + #********************************************* + def visualize_search(self,optuna_study): + # Optimization history: + optuna.visualization.matplotlib.plot_optimization_history(optuna_study) + plt.gcf().set_size_inches(20,7) + plt.savefig(self.hpo_result_folder+"/optimization_history.png") + plt.close() + # Parameter Importance: + optuna.visualization.matplotlib.plot_param_importances(optuna_study) + plt.gcf().set_size_inches(15,7) + 
plt.savefig(self.hpo_result_folder+"/hp_importance.png") + plt.close() + # Parallel plot: + optuna.visualization.matplotlib.plot_parallel_coordinate(optuna_study, params=self.hpo_param_importance) + plt.gcf().set_size_inches(15,7) + plt.savefig(self.hpo_result_folder+"/hp_parallel.png") + plt.close() + #********************************************* + + # Get the predicion: + #********************************************* + def predict(self,x,to_numpy=True): + return self.model.predict(x,to_numpy) + #********************************************* + + # Run the HP search and the final training of the best model: + #********************************************* + def train(self,x): + # Define a study: + study = optuna.create_study(direction=self.hpo_objective_direction,study_name=self.hpo_study_name) + + self.hpo_data = x + # Run the optimization: + study.optimize(self.objective,n_trials=self.n_hpo_trials,gc_after_trial=True) + + # And visualize everything: + self.visualize_search(study) + + # Write the 'final' configuration to file: + cfg_loc = self.hpo_result_folder + "/best_model_settings" + self.model.save_config(cfg_loc) + + # Finally, train the 'best' model for a few more epochs: + results = self.model.train( + x=x, + n_epochs=self.n_epochs, + batch_size=self.batch_size, + validation_split=self.validation_split, + verbosity=self.verbosity + ) + + # Store the model itself: + self.save(self.hpo_result_folder) + + return results + #********************************************* + + # Store / load the network: + #**************************** + # Save the entire model: + def save(self,model_loc): + self.model.save(model_loc) + + #---------------- + + def load(self): + pass + #**************************** + + + # Get the encoder / decoder models themselves: + #********************************************* + def get_model(self,x=None): + return self.model.get_model() + #********************************************* + diff --git a/jlab_datascience_toolkit/models/__init__.py b/jlab_datascience_toolkit/models/__init__.py new file mode 100644 index 0000000..12ea997 --- /dev/null +++ b/jlab_datascience_toolkit/models/__init__.py @@ -0,0 +1,13 @@ +from jlab_datascience_toolkit.utils.registration import register, make, list_registered_modules + +# Keras CNN Autoencoder: +register( + id="KerasCNNAE_v0", + entry_point="jlab_datascience_toolkit.models.keras_cnn_ae:KerasCNNAE" +) + +# HPO for Keras CNN Autoencoder: +register( + id="HPOKerasCNNAE_v0", + entry_point="jlab_datascience_toolkit.hyper_parameter_tuning.hpo_keras_cnn_ae:HPOKerasCNNAE" +) \ No newline at end of file diff --git a/jlab_datascience_toolkit/models/keras_cnn_ae.py b/jlab_datascience_toolkit/models/keras_cnn_ae.py new file mode 100644 index 0000000..0720252 --- /dev/null +++ b/jlab_datascience_toolkit/models/keras_cnn_ae.py @@ -0,0 +1,408 @@ +from jlab_datascience_toolkit.core.jdst_model import JDSTModel +import tensorflow as tf +from tensorflow import keras +import numpy as np +import gc +import logging +import yaml +import os +import inspect +from sklearn.model_selection import train_test_split +from keras.callbacks import ModelCheckpoint +from jlab_datascience_toolkit.utils.architectures.keras_cnn_ae_architecture import KerasCNNAEArchitecture +from jlab_datascience_toolkit.utils.keras_callbacks.keras_early_stopping import KerasEarlyStopping +from jlab_datascience_toolkit.utils.keras_callbacks.keras_garbage_handler import KerasGarbageHandler + +class KerasCNNAE(keras.Model,JDSTModel): + ''' + Class for setting up an Autoencoder with 
convolutional and optionally dense layers. This class uses the KerasCNNAEArchitecture class + to set up the architecture. + ''' + + # Initialize: + #**************************** + def __init__(self,path_to_cfg,user_config={}): + super(KerasCNNAE, self).__init__() + + # Set the name specific to this module: + self.module_name = "keras_cnn_ae" + + # Load the configuration: + self.config = self.load_config(path_to_cfg,user_config) + + # Save this config, if a path is provided: + if 'store_cfg_loc' in self.config: + self.save_config(self.config['store_cfg_loc']) + + # Retrieve settings from configuration: + precision = self.config['precision'] + # Get the architecture class: + self.ae_architecture = KerasCNNAEArchitecture(precision) + + # Model storage / loading and model format: + self.store_model_loc = self.config['store_model_loc'] + self.load_model_loc = self.config['load_model_loc'] + self.model_store_format = self.config['model_store_format'] + self.compile_loaded_model = self.config['compile_loaded_model'] + + # Get the image dimensions --> Important for setting the network architecture properly: + self.image_dims = self.config['image_dimensions'] + + # NETWORK ARCHITECTURE AND FEATURES: + + # Down / Up-sampling of the image, in case it is too large: + max_pooling = self.config['max_pooling'] + # Convolutional units: + conv_architecture = self.config['conv_architecture'] + conv_activations = self.config['conv_activations'] + conv_kernel_inits = self.config['conv_kernel_inits'] + conv_bias_inits = self.config['conv_bias_inits'] + kernel_sizes = self.config['kernel_sizes'] + strides = self.config['strides'] + + # Run consistency check on strides and maximum pooling: + strides, d_shape_x, d_shape_y = self.inspect_pooling_and_strides(self.image_dims,max_pooling,strides) + + # Dense units: + dense_architecture = self.config['dense_architecture'] + dense_activations = self.config['dense_activations'] + dense_kernel_inits = self.config['dense_kernel_inits'] + dense_bias_inits = self.config['dense_bias_inits'] + + # Latent dimension: + latent_dim = self.config['latent_dim'] + latent_space_is_2d = self.config['latent_space_is_2d'] + + # Handle reshaping the inputs for the decoder (going from flat to Conv2D): + decoder_conv_reshape_units = self.config['decoder_conv_reshape_units'] + decoder_conv_reshape = [int(d_shape_x),int(d_shape_y),decoder_conv_reshape_units] + + # Response of decoder output layer: + output_activation = self.config['output_activation'] + output_filter = self.image_dims[2] + output_kernel_size = self.config['output_kernel_size'] + output_strides = self.config['output_strides'] + + # OPTIMIZER AND LOSS FUNCTION: + self.learning_rate = self.config['learning_rate'] + self.optimizer_str = self.config['optimizer'] + self.loss_function_str = self.config['loss_function'] + + # Make sure that decoder outputs are set properly, if we decide to work with logits: + self.use_logits = False + if self.loss_function_str.lower() == "logit_bce": + output_activation = "linear" + self.use_logits = True + + # TRAINING: + self.n_epochs = self.config['n_epochs'] + self.batch_size = self.config['batch_size'] + self.validation_split = self.config['validation_split'] + self.verbosity = self.config['verbosity'] + + # Add early stopping callback (if config is properly set): + self.early_stopping = KerasEarlyStopping(self.config).get_callback() + + # BUILD THE MODEL: + # Check if the model already exists and just needs to be loaded: + if self.load_model_loc is not None: + self.load(self.load_model_loc) + 
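+ # Otherwise, build the encoder and decoder from scratch via the architecture class: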
else: + # Encoder: + self.encoder = self.ae_architecture.get_encoder( + input_dimensions=self.image_dims, + conv_architecture=conv_architecture, + conv_activations=conv_activations, + conv_kernel_inits=conv_kernel_inits, + conv_bias_inits=conv_bias_inits, + kernel_sizes=kernel_sizes, + strides=strides, + dense_architecture=dense_architecture, + dense_activations=dense_activations, + dense_kernel_inits=dense_kernel_inits, + dense_bias_inits=dense_bias_inits, + latent_dim=latent_dim, + latent_activation='linear', + latent_kernel_init='glorot_normal', + latent_is_2d=latent_space_is_2d, + max_pooling=max_pooling, + encoder_name="Encoder" + ) + + # Decoder: + self.decoder = self.ae_architecture.get_decoder( + latent_dim=latent_dim, + latent_is_2d=latent_space_is_2d, + reshape_dimensions=decoder_conv_reshape, + conv_architecture=conv_architecture[::-1], + conv_activations=conv_activations[::-1], + conv_kernel_inits=conv_kernel_inits[::-1], + conv_bias_inits=conv_bias_inits[::-1], + kernel_sizes=kernel_sizes[::-1], + strides=strides[::-1], + dense_architecture=dense_architecture[::-1], + dense_activations=dense_activations[::-1], + dense_kernel_inits=dense_kernel_inits[::-1], + dense_bias_inits=dense_bias_inits[::-1], + output_filter=output_filter, + output_kernel_size=output_kernel_size, + output_strides=output_strides, + output_activation=output_activation, + max_pooling=max_pooling, + decoder_name="Decoder" + ) + + # Compile the model: + self.compile() + #**************************** + + # Pass on the data type check for now, as tensorflow allows a variety of data types + # see here: https://www.tensorflow.org/api_docs/python/tf/keras/Model#fit + #**************************** + def check_input_data_type(self): + pass + #**************************** + + # Run dimensional check on max. pooling and strides + #**************************** + def inspect_pooling_and_strides(self,idims,max_pooling,stride_list): + new_stride_list = [] + xdim = idims[0] + ydim = idims[1] + + # Apply max. 
pooling + if max_pooling is not None and max_pooling[0] > 0 and max_pooling[1] > 0: + xdim /= max_pooling[0] + ydim /= max_pooling[1] + + #++++++++++++++++++++ + for s in stride_list: + t_x = xdim % s + t_y = ydim % s + + if t_x == 0 and t_y == 0: + xdim /= s + ydim /= s + new_stride_list.append(s) + else: + new_stride_list.append(1) + #++++++++++++++++++++ + + return new_stride_list, xdim,ydim + #**************************** + + # Compile the model: + #**************************** + def compile(self): + super(KerasCNNAE, self).compile() + # Register the components: + + # Specify optimizer and loss function: + self.optimizer = None + self.loss_fn = None + + if self.optimizer_str.lower() == "adam": + self.optimizer = keras.optimizers.Adam(self.learning_rate) + + if self.optimizer_str.lower() == "legacy_adam": + self.optimizer = keras.optimizers.legacy.Adam(self.learning_rate) + + if self.optimizer_str.lower() == "sgd": + self.optimizer = keras.optimizers.SGD(self.learning_rate) + + if self.loss_function_str.lower() == "mse": + self.loss_fn = keras.losses.MeanSquaredError() + + if self.loss_function_str.lower() == "mae": + self.loss_fn = keras.losses.MeanAbsoluteError() + + if self.loss_function_str.lower() == "huber": + self.loss_fn = keras.losses.Huber() + + if self.loss_function_str.lower() == "logit_bce": + def loss_func(x,x_logit): + return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=x_logit, labels=x)) + + self.loss_fn = loss_func + #**************************** + + # Get Model response: + #**************************** + # Call: + def call(self,x): + z = self.encoder(x) + return self.decoder(z) + + #------------------------- + + # Predict: + def predict(self,x,to_numpy=True): + z = self.encoder(x) + x_rec = self.decoder(z) + + if self.use_logits == True: + x_rec = tf.sigmoid(x_rec) + + gc.collect() + + if to_numpy == True: + return{ + 'z_model':z.numpy(), + 'x_rec':x_rec.numpy() + } + + return{ + 'z_model':z, + 'x_rec':x_rec + } + #**************************** + + # Autoencoder training: + #**************************** + # Train step: + @tf.function + def train_step(self,x): + with tf.GradientTape() as tape: + z = self.encoder(x) + x_rec = self.decoder(z) + loss = self.loss_fn(x,x_rec) + + gradients = tape.gradient(loss,self.trainable_variables) + self.optimizer.apply_gradients(zip(gradients,self.trainable_variables)) + + return { + 'loss':loss, + } + + #------------------------- + + @tf.function + def test_step(self,x): + z = self.encoder(x) + x_rec = self.decoder(z) + loss = self.loss_fn(x,x_rec) + + return { + 'loss':loss + } + + #------------------------- + + # Entire fit function: + def train(self,x,n_epochs=None,batch_size=None,validation_split=None,verbosity=None): + # Use the default settings, if no explicit ones are provided: + if n_epochs is None: + n_epochs = self.n_epochs + + if batch_size is None: + batch_size = self.batch_size + + if validation_split is None: + validation_split = self.validation_split + + if verbosity is None: + verbosity = self.verbosity + + # Divide the data in training and validations data: + x_train, x_test = train_test_split(x,test_size=validation_split) + + # Handle callbacks: + ae_callbacks = [tf.keras.callbacks.TerminateOnNaN(),KerasGarbageHandler()] + if self.early_stopping is not None: + ae_callbacks.append(self.early_stopping) + + results = super(KerasCNNAE, self).fit( + x=x_train, + y=None, + validation_data=(x_test,None), + batch_size=batch_size, + epochs=n_epochs, + shuffle=True, + callbacks=ae_callbacks, + verbose=verbosity 
+ ) + + outputs = {} + #+++++++++++++++++++ + for key in results.history: + outputs[key] = results.history[key] + #+++++++++++++++++++ + + # Store the model, if a path is provided: + if self.store_model_loc is not None and self.store_model_loc != "": + os.makedirs(self.store_model_loc,exist_ok=True) + + self.save(self.store_model_loc) + self.save_config(self.store_model_loc+"/keras_cnn_ae_cfg.yaml") + + return outputs + #**************************** + + # Store / load the network: + #**************************** + # Save the entire model: + def save(self,model_loc): + self.encoder.save(model_loc+"/keras_cnn_ae_encoder"+self.model_store_format) + self.decoder.save(model_loc+"/keras_cnn_ae_decoder"+self.model_store_format) + + #---------------- + + def load(self,model_loc): + self.encoder = keras.models.load_model(model_loc+"/keras_cnn_ae_encoder"+self.model_store_format) + self.decoder = keras.models.load_model(model_loc+"/keras_cnn_ae_decoder"+self.model_store_format) + + # Check if re-compilation is required: + if self.compile_loaded_model == True: + # We need to keep track of the weights, otherwise, they are lost after compilation: + encoder_weights = self.encoder.get_weights() + decoder_weights = self.decoder.get_weights() + + self.compile() + + self.encoder.set_weights(encoder_weights) + self.decoder.set_weights(decoder_weights) + #**************************** + + # Provide information about this module: + #********************************************* + def get_info(self): + print(inspect.getdoc(self)) + #********************************************* + + # Handle configurations: + #********************************************* + # Load the config: + def load_config(self,path_to_cfg,user_config): + with open(path_to_cfg, 'r') as file: + cfg = yaml.safe_load(file) + + # Overwrite config with user settings, if provided + try: + if bool(user_config): + #++++++++++++++++++++++++ + for key in user_config: + cfg[key] = user_config[key] + #++++++++++++++++++++++++ + except: + logging.exception(">>> " + self.module_name +": Invalid user config. 
Please make sure that a dictionary is provided <<<") + + return cfg + + #----------------------------- + + # Store the config: + def save_config(self,path_to_config): + with open(path_to_config, 'w') as file: + yaml.dump(self.config, file) + #********************************************* + + # Get the encoder / decoder models themselves: + #********************************************* + def get_model(self,x=None): + return [self.encoder,self.decoder] + #********************************************* + + + + + \ No newline at end of file diff --git a/jlab_datascience_toolkit/utils/architectures/keras_cnn_ae_architecture.py b/jlab_datascience_toolkit/utils/architectures/keras_cnn_ae_architecture.py new file mode 100644 index 0000000..614acf1 --- /dev/null +++ b/jlab_datascience_toolkit/utils/architectures/keras_cnn_ae_architecture.py @@ -0,0 +1,312 @@ +import tensorflow as tf +from tensorflow import keras + +class KerasCNNAEArchitecture(object): + ''' + Specify the architecture of a (Variatonal) AutoEncoder + ''' + + # Initialize: + #************************* + def __init__(self,precision='float32'): + self.precision = precision + #************************* + + # Helper function to set / register activation functions: + #************************* + def get_activation_function(self,act_fn_str,name): + if act_fn_str.lower() == "leaky_relu": + return keras.layers.Activation(tf.nn.leaky_relu,name=name) + + return keras.layers.Activation(act_fn_str.lower(),name=name,dtype=tf.keras.mixed_precision.Policy(self.precision)) + #************************* + + # Encoder: + #************************* + def get_encoder(self,input_dimensions,conv_architecture,conv_activations,conv_kernel_inits,conv_bias_inits,kernel_sizes,strides,dense_architecture,dense_activations,dense_kernel_inits,dense_bias_inits,latent_dim,latent_activation,latent_kernel_init,latent_is_2d,max_pooling,encoder_name): + # Get the number of conv. / dense layers: + n_conv_layers = len(conv_architecture) + n_dense_layers = len(dense_architecture) + + # Define the encoder input: + encoder_inputs = keras.layers.Input( + shape=input_dimensions, + name=encoder_name+"_input" + ) + + # Apply max-pooling, if requested: + x_enc = encoder_inputs + if max_pooling is not None and max_pooling[0] > 0 and max_pooling[1] > 0: + x_enc = keras.layers.MaxPooling2D( + pool_size=(max_pooling[0],max_pooling[1]), + dtype=tf.keras.mixed_precision.Policy(self.precision) + )(x_enc) + + # Add conv. 
parts: + #++++++++++++++++++++++++++ + for k in range(n_conv_layers): + conv_laver = keras.layers.Conv2D( + filters=conv_architecture[k], + kernel_size=kernel_sizes[k], + strides=strides[k], + kernel_initializer=conv_kernel_inits[k], + bias_initializer=conv_bias_inits[k], + name=encoder_name+"_conv"+str(k), + dtype=tf.keras.mixed_precision.Policy(self.precision) + ) + conv_activation = self.get_activation_function(conv_activations[k],name=encoder_name+"_conv_act"+str(k)) + + x_enc = conv_laver(x_enc) + x_enc = conv_activation(x_enc) + #++++++++++++++++++++++++++ + + # Add a flattening layer: + x_enc = keras.layers.Flatten(dtype=tf.keras.mixed_precision.Policy(self.precision))(x_enc) + + # Add a dense part, if wanted: + if n_dense_layers > 0: + #++++++++++++++++++++++++++ + for d in range(n_dense_layers): + dense_layer = keras.layers.Dense( + units=dense_architecture[d], + kernel_initializer=dense_kernel_inits[d], + bias_initializer=dense_bias_inits[d], + name=encoder_name+"_dense"+str(d), + dtype=tf.keras.mixed_precision.Policy(self.precision) + ) + dense_activation = self.get_activation_function(dense_activations[d],name=encoder_name+"_dense_act"+str(d)) + + x_enc = dense_layer(x_enc) + x_enc = dense_activation(x_enc) + #++++++++++++++++++++++++++ + + # Check if the latent layer is two dimenstional: + if latent_is_2d == True: + # Handle the encoding / latent layer: + encoding_layer = keras.layers.Dense( + units=latent_dim*latent_dim, + activation=latent_activation, + name=encoder_name+"_latent_layer", + kernel_initializer=latent_kernel_init, + bias_initializer="zeros", + dtype=tf.keras.mixed_precision.Policy(self.precision) + ) + encoder_outputs = encoding_layer(x_enc) + encoder_outputs = keras.layers.Reshape(target_shape=(latent_dim,latent_dim))(encoder_outputs) + return keras.models.Model(encoder_inputs,encoder_outputs,name=encoder_name) + + else: # Or not... + + # Handle the encoding / latent layer: + encoding_layer = keras.layers.Dense( + units=latent_dim, + activation=latent_activation, + name=encoder_name+"_latent_layer", + kernel_initializer=latent_kernel_init, + bias_initializer="zeros", + dtype=tf.keras.mixed_precision.Policy(self.precision) + ) + encoder_outputs = encoding_layer(x_enc) + return keras.models.Model(encoder_inputs,encoder_outputs,name=encoder_name) + #************************* + + # Decoder: + #************************* + def get_decoder(self,latent_dim,latent_is_2d,reshape_dimensions,conv_architecture,conv_activations,conv_kernel_inits,conv_bias_inits,kernel_sizes,strides,dense_architecture,dense_activations,dense_kernel_inits,dense_bias_inits,output_filter,output_kernel_size,output_strides,output_activation,max_pooling,decoder_name): + # Get the number of conv. 
/ dense layers: + n_conv_layers = len(conv_architecture) + n_dense_layers = len(dense_architecture) + + # Define the decoder inputs: + decoder_inputs = None + z_dec = None + + # First, check if the latent input is 2D matrix: + if latent_is_2d == True: + decoder_inputs = keras.layers.Input( + shape=(latent_dim,latent_dim,), + name=decoder_name+"_input" + ) + z_dec = keras.layers.Flatten()(decoder_inputs) + else: + decoder_inputs = keras.layers.Input( + shape=(latent_dim,), + name=decoder_name+"_input" + ) + z_dec = decoder_inputs + + # Add a dense part, if requested: + if n_dense_layers > 0: + #++++++++++++++++++++++++++ + for d in range(n_dense_layers): + dense_layer = keras.layers.Dense( + units=dense_architecture[d], + kernel_initializer=dense_kernel_inits[d], + bias_initializer=dense_bias_inits[d], + name=decoder_name+"_dense"+str(d), + dtype=tf.keras.mixed_precision.Policy(self.precision) + ) + dense_activation = self.get_activation_function(dense_activations[d],name=decoder_name+"_dense_act"+str(d)) + + z_dec = dense_layer(z_dec) + z_dec = dense_activation(z_dec) + #++++++++++++++++++++++++++ + + # Now we need to reshape in order to translate everything back + # from the 1D latent space to the conv. space + reshaping_layer = keras.layers.Dense( + units=reshape_dimensions[0]*reshape_dimensions[1]*reshape_dimensions[2], + activation='relu', + kernel_initializer='he_normal', + bias_initializer='zeros', + name=decoder_name+'_conv_reshape', + dtype=tf.keras.mixed_precision.Policy(self.precision) + ) + z_dec = reshaping_layer(z_dec) + # Now convert everything to 2D: + z_dec = keras.layers.Reshape(target_shape=reshape_dimensions)(z_dec) + + # Translate everything back via transpose conv.: + #++++++++++++++++++++++++++ + for k in range(n_conv_layers): + transpose_conv_layer = keras.layers.Conv2DTranspose( + filters=conv_architecture[k], + kernel_size=kernel_sizes[k], + strides=strides[k], + padding='same', + kernel_initializer=conv_kernel_inits[k], + bias_initializer=conv_bias_inits[k], + name=decoder_name+"_convT"+str(k), + dtype=tf.keras.mixed_precision.Policy(self.precision) + ) + transpose_conv_activation = self.get_activation_function(conv_activations[k],name=decoder_name+"_convT_act"+str(k)) + + z_dec = transpose_conv_layer(z_dec) + z_dec = transpose_conv_activation(z_dec) + #++++++++++++++++++++++++++ + + # Add an output layer: + output_layer = keras.layers.Conv2DTranspose( + filters=output_filter, + kernel_size=output_kernel_size, + strides=output_strides, + activation=output_activation, + padding='same', + name=decoder_name+"_output", + dtype=tf.keras.mixed_precision.Policy(self.precision) + ) + x_rec = output_layer(z_dec) + + # Undo the max. 
pooling, if existent: + if max_pooling is not None and max_pooling[0] > 0 and max_pooling[1] > 0: + x_rec = keras.layers.UpSampling2D( + size=(max_pooling[0],max_pooling[1]), + interpolation='nearest', + dtype=tf.keras.mixed_precision.Policy(self.precision) + )(x_rec) + + return keras.models.Model(decoder_inputs,x_rec,name=decoder_name) + #************************* + + + + # Experimental: Define encoder with conditional input --> So that we can use it for a diffusion model: + + # Encoder for a diffusion model: + #************************* + def get_encoder_for_diffusion(self,input_dimensions,input_dimensions2,input2_processing_fn,conv_architecture,conv_activations,conv_kernel_inits,conv_bias_inits,kernel_sizes,strides,dense_architecture,dense_activations,dense_kernel_inits,dense_bias_inits,latent_dim,latent_activation,latent_kernel_init,latent_is_2d,max_pooling,encoder_name): + # Get the number of conv. / dense layers: + n_conv_layers = len(conv_architecture) + n_dense_layers = len(dense_architecture) + + # Define the encoder input: + encoder_inputs = keras.layers.Input( + shape=input_dimensions, + name=encoder_name+"_input" + ) + + encoder_inputs_2 = keras.layers.Input( + shape=input_dimensions2, + name=encoder_name+"_input_2" + ) + + # Apply max-pooling, if requested: + x_enc = encoder_inputs + if max_pooling is not None and max_pooling[0] > 0 and max_pooling[1] > 0: + x_enc = keras.layers.MaxPooling2D( + pool_size=(max_pooling[0],max_pooling[1]), + dtype=tf.keras.mixed_precision.Policy(self.precision) + )(x_enc) + + x_enc_2 = input2_processing_fn(x_enc,encoder_inputs_2) + x_enc = keras.layers.Concatenate()([x_enc,x_enc_2]) + + # Add conv. parts: + #++++++++++++++++++++++++++ + for k in range(n_conv_layers): + conv_laver = keras.layers.Conv2D( + filters=conv_architecture[k], + kernel_size=kernel_sizes[k], + strides=strides[k], + kernel_initializer=conv_kernel_inits[k], + bias_initializer=conv_bias_inits[k], + name=encoder_name+"_conv"+str(k), + dtype=tf.keras.mixed_precision.Policy(self.precision) + ) + conv_activation = self.get_activation_function(conv_activations[k],name=encoder_name+"_conv_act"+str(k)) + + x_enc = conv_laver(x_enc) + x_enc = conv_activation(x_enc) + #++++++++++++++++++++++++++ + + # Add a flattening layer: + x_enc = keras.layers.Flatten(dtype=tf.keras.mixed_precision.Policy(self.precision))(x_enc) + + # Add a dense part, if wanted: + if n_dense_layers > 0: + #++++++++++++++++++++++++++ + for d in range(n_dense_layers): + dense_layer = keras.layers.Dense( + units=dense_architecture[d], + kernel_initializer=dense_kernel_inits[d], + bias_initializer=dense_bias_inits[d], + name=encoder_name+"_dense"+str(d), + dtype=tf.keras.mixed_precision.Policy(self.precision) + ) + dense_activation = self.get_activation_function(dense_activations[d],name=encoder_name+"_dense_act"+str(d)) + + x_enc = dense_layer(x_enc) + x_enc = dense_activation(x_enc) + #++++++++++++++++++++++++++ + + # Check if the latent layer is two dimenstional: + if latent_is_2d == True: + # Handle the encoding / latent layer: + encoding_layer = keras.layers.Dense( + units=latent_dim*latent_dim, + activation=latent_activation, + name=encoder_name+"_latent_layer", + kernel_initializer=latent_kernel_init, + bias_initializer="zeros", + dtype=tf.keras.mixed_precision.Policy(self.precision) + ) + encoder_outputs = encoding_layer(x_enc) + encoder_outputs = keras.layers.Reshape(target_shape=(latent_dim,latent_dim))(encoder_outputs) + return keras.models.Model(encoder_inputs,encoder_outputs,name=encoder_name) + + else: # 
Or not... + + # Handle the encoding / latent layer: + encoding_layer = keras.layers.Dense( + units=latent_dim, + activation=latent_activation, + name=encoder_name+"_latent_layer", + kernel_initializer=latent_kernel_init, + bias_initializer="zeros", + dtype=tf.keras.mixed_precision.Policy(self.precision) + ) + encoder_outputs = encoding_layer(x_enc) + return keras.models.Model(inputs=[encoder_inputs,encoder_inputs_2],outputs=encoder_outputs,name=encoder_name) + #************************* + + diff --git a/jlab_datascience_toolkit/utils/check_internet_connection.py b/jlab_datascience_toolkit/utils/check_internet_connection.py new file mode 100644 index 0000000..93fa049 --- /dev/null +++ b/jlab_datascience_toolkit/utils/check_internet_connection.py @@ -0,0 +1,29 @@ +import socket + +# check internet connection: this is useful as OUD RRCE does not allow internet access +def internet_available(host="8.8.8.8", port=53, timeout=3): + """ + Host: 8.8.8.8 (google-public-dns-a.google.com) + OpenPort: 53/tcp + Service: domain (DNS/TCP) + Check if internet is available by attempting to connect to a known server. + """ + try: + socket.setdefaulttimeout(timeout) + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.connect((host, port)) + + print("Internet connection available") + return True + + except socket.timeout: + print("No Internet: Connection timed out") + return False + + except socket.gaierror: + print("No Internet: Address-related error connecting to server") + return False + + except socket.error as e: + print(f"No Internet: Network error: {e}") + return False diff --git a/jlab_datascience_toolkit/utils/get_mnist.py b/jlab_datascience_toolkit/utils/get_mnist.py new file mode 100644 index 0000000..1809ed2 --- /dev/null +++ b/jlab_datascience_toolkit/utils/get_mnist.py @@ -0,0 +1,39 @@ +from jlab_datascience_toolkit.utils.check_internet_connection import internet_available + +import tensorflow as tf +import numpy as np +import os + +def get_mnist_data(): + """ + Load MNIST data based on internet availability. + """ + if internet_available(): + try: + (x_train, y_train), (x_val, y_val) = tf.keras.datasets.mnist.load_data() + print("MNIST dataset has been downloaded.") + except Exception as e: + print(f"Failed to download MNIST dataset due to an error: {e}") + x_train, y_train, x_val, y_val = load_local_mnist() + else: + print("Trying to load dataset locally...") + x_train, y_train, x_val, y_val = load_local_mnist() + + return x_train, y_train, x_val, y_val + +def load_local_mnist(): + """ + Helper function to load MNIST data locally; assumes data is within the repo. + """ + current_file_path = os.path.dirname(os.path.realpath(__file__)) + data_path = os.path.join(current_file_path, '..', 'data', 'example_data', 'mnist_data') + + if os.path.exists(data_path): + x_train = np.load(os.path.join(data_path, 'x_train.npy')) + x_val = np.load(os.path.join(data_path, 'x_val.npy')) + y_train = np.load(os.path.join(data_path, 'y_train.npy')) + y_val = np.load(os.path.join(data_path, 'y_val.npy')) + print("... Loaded MNIST data from local file.") + else: + raise FileNotFoundError("Local MNIST file not found. 
Please check the path.") + return x_train, y_train, x_val, y_val \ No newline at end of file diff --git a/jlab_datascience_toolkit/utils/graph_driver_utils.py b/jlab_datascience_toolkit/utils/graph_driver_utils.py new file mode 100644 index 0000000..07e79f7 --- /dev/null +++ b/jlab_datascience_toolkit/utils/graph_driver_utils.py @@ -0,0 +1,82 @@ +from typing import NamedTuple, Union, Iterable +from jlab_datascience_toolkit.utils.registration import make +import jlab_datascience_toolkit.data_parsers +import jlab_datascience_toolkit.data_preps +import jlab_datascience_toolkit.models +import jlab_datascience_toolkit.analyses + +''' +This class was developed by Steven Goldenberg (sgolden@jlab.org) and helps set up a generic driver in an elegant way. +''' + +class GraphRuntime(): + class Edge(NamedTuple): + input: Union[str, tuple] + function: str + output: Union[str, tuple] + + def tuples_to_edges(self,tuple_list): + edges = [] + for tuple in tuple_list: + edges.append(self.Edge(*tuple)) + + return edges + + def get_distinct_data_dict(self, graph_edges): + distinct_data = set() + for edge in graph_edges: + if edge.input is not None: + if isinstance(edge.input, str): + distinct_data.add(edge.input) + else: + [distinct_data.add(val) for val in edge.input] + if edge.output is not None: + if isinstance(edge.output, str): + distinct_data.add(edge.output) + else: + [distinct_data.add(val) for val in edge.output] + + return dict.fromkeys(distinct_data, None) + + def get_module_dict(self,modules,config_paths,user_configs): + module_dict = dict.fromkeys(modules, None) + for m_name in module_dict: + module_id = modules[m_name] + print(f'Making {m_name} with module ID: {module_id}') + module_dict[m_name] = make(module_id,path_to_cfg=config_paths[m_name],user_config=user_configs[m_name]) + + return module_dict + + def run_graph(self, graph, modules,config_paths,user_configs): + graph_edges = self.tuples_to_edges(graph) + data = self.get_distinct_data_dict(graph_edges) + module_dict = self.get_module_dict(modules,config_paths,user_configs) + for edge in graph_edges: + + if '.' 
in edge.function: + m_name, fn_call = edge.function.split('.') + fn = getattr(module_dict[m_name], fn_call) + else: + fn = getattr(self, edge.function) + + if edge.input is None: + fn_in = [] #Unpacks to 0 arguments + elif isinstance(edge.input, str): + fn_in = [data[edge.input]] # Unpacks to 1 argument + elif isinstance(edge.input, Iterable): + fn_in = [data[val] for val in edge.input] + + # Take advantage of list unpacking for arguments + out = fn(*fn_in) + + if out is not None: + if isinstance(edge.output, tuple): + for o, d in zip(out, edge.output): + data[d] = o + else: + data[edge.output] = out + + return data, module_dict + + def combine(self, *inputs): + return inputs \ No newline at end of file diff --git a/jlab_datascience_toolkit/utils/keras_callbacks/keras_early_stopping.py b/jlab_datascience_toolkit/utils/keras_callbacks/keras_early_stopping.py new file mode 100644 index 0000000..3127091 --- /dev/null +++ b/jlab_datascience_toolkit/utils/keras_callbacks/keras_early_stopping.py @@ -0,0 +1,34 @@ +from tensorflow import keras + +class KerasEarlyStopping(object): + + # Initialize: + #**************************** + def __init__(self,config): + self.monitor = config.get("early_stopping_monitor",None) + self.min_delta = config.get("early_stopping_min_delta",-1.0) + self.patience = config.get("early_stopping_patience",0) + self.verbose = config.get("early_stopping_verbose",0) + self.mode = config.get("early_stopping_mode",'auto') + self.baseline = config.get("early_stopping_baseline",None) + self.best_weights = config.get("early_stopping_restore_best_weights",False) + self.start_epoch = config.get("early_stopping_start_epoch",0) + #**************************** + + # Provide the callback + #**************************** + def get_callback(self): + if self.monitor is None or self.min_delta < 0.0: + return None + else: + return keras.callbacks.EarlyStopping( + monitor=self.monitor, + min_delta=self.min_delta, + patience=self.patience, + verbose=self.verbose, + mode=self.mode, + baseline=self.baseline, + restore_best_weights=self.best_weights, + start_from_epoch=self.start_epoch + ) + #**************************** \ No newline at end of file diff --git a/jlab_datascience_toolkit/utils/keras_callbacks/keras_garbage_handler.py b/jlab_datascience_toolkit/utils/keras_callbacks/keras_garbage_handler.py new file mode 100644 index 0000000..e170f97 --- /dev/null +++ b/jlab_datascience_toolkit/utils/keras_callbacks/keras_garbage_handler.py @@ -0,0 +1,16 @@ +import tensorflow as tf +import gc + +# Try to keep track of the memory consumption during the training phase: +# This code was taken from: +# https://stackoverflow.com/questions/64666917/optuna-memory-issues +class KerasGarbageHandler(tf.keras.callbacks.Callback): + + # Clear memory at the end of every training epoch: + #****************************** + def on_epoch_end(self, epoch, logs=None): + gc.collect() + #****************************** + + + diff --git a/utests/utest_data_reconstruction.py b/utests/utest_data_reconstruction.py new file mode 100644 index 0000000..8e46231 --- /dev/null +++ b/utests/utest_data_reconstruction.py @@ -0,0 +1,131 @@ +import jlab_datascience_toolkit.models as models +import jlab_datascience_toolkit.analyses as analyses +from jlab_datascience_toolkit.utils.get_mnist import get_mnist_data +import unittest +import numpy as np +import matplotlib.pyplot as plt +import os +import tensorflow as tf + +class UTestDataReconstruction(unittest.TestCase): + + # Initialize: + #***************************************** + def 
__init__(self,*args, **kwargs): + super(UTestDataReconstruction,self).__init__(*args, **kwargs) + + # Get an intro: + print(" ") + print("*************************************") + print("* *") + print("* Data Reconstruction Unit-Test *") + print("* *") + print("*************************************") + print(" ") + + # First,get the MNIST data set offline or online: + print("Get MNIST data...") + + data, _, _, _ = get_mnist_data() + + print("...done!") + print(" ") + + # Do some minor preprocessing: + print("Preprocess data...") + + data = data.reshape((data.shape[0], 28, 28, 1)) / 255. + self.data = np.where(data > .5, 1.0, 0.0).astype('float32') + + print("...done!") + print(" ") + #***************************************** + + # Test drive the model: + #***************************************** + def test_data_reconstruction(self): + # Set the model id: + model_id = 'KerasCNNAE_v0' + + # Store the results of this unit-test somewhere: + result_loc = 'results_utest_data_reconstruction' + os.makedirs(result_loc,exist_ok=True) + + # Get the default configuration: + this_file_loc = os.path.dirname(__file__) + model_cfg_loc = os.path.join(this_file_loc,'../jlab_datascience_toolkit/cfgs/defaults/keras_cnn_ae_cfg.yaml') + + # And maybe add some user specific values: + use_conv_latent_layer = False + model_user_cfg = { + 'image_dimensions':(self.data.shape[1],self.data.shape[2],self.data.shape[3]), + 'n_epochs': 10, + 'dense_architecture':[3,3], + 'dense_activations':['relu']*2, + 'dense_kernel_inits':['he_normal']*2, + 'dense_bias_inits':['he_normal']*2, + 'latent_space_is_2d':use_conv_latent_layer, + 'optimizer':'legacy_adam', + 'early_stopping_monitor':'val_loss', + 'early_stopping_min_delta':0.00005, + 'early_stopping_patience':3, + 'early_stopping_restore_best_weights':True, + } + + print("Set up model...") + + # Set the model: + model = models.make(model_id,path_to_cfg=model_cfg_loc,user_config=model_user_cfg) + + # And the model list: + model_list = model.get_model() + + print("...done!") + print(" ") + + # Get id for analysis module: + ana_id = "DataReconstruction_v0" + + # Specify configuration: + ana_cfg_loc = os.path.join(this_file_loc,'../jlab_datascience_toolkit/cfgs/defaults/data_reconstruction_cfg.yaml') + + data_fraction_to_analyze = 200 + ana_user_cfg = { + 'output_loc': result_loc, + 'analysis_sample_size': 50, + 'n_analysis_samples':data_fraction_to_analyze + } + + # Load the module: + print("Load data reconstruction module...") + + analyzer = analyses.make(ana_id,path_to_cfg=ana_cfg_loc,user_config=ana_user_cfg) + + print("...done!") + print(" ") + + # Run the reconstruction: + print("Run reconstruction...") + + rec_data = analyzer.run(self.data,model_list) + + print("...done!") + print(" ") + + # We compare the shapes of the input and reconstructed data... Ideally they should be equal... 
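+ # (Since n_analysis_samples > 0 here, only a random sub-sample is reconstructed, so we compare against the originals returned by the module; otherwise we would compare against the full input data.)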
+ pass_dim_check = False + if data_fraction_to_analyze > 0: + if rec_data['x_rec'].shape[0] == rec_data['x_orig'].shape[0] and rec_data['x_rec'].shape[1] == rec_data['x_orig'].shape[1] and rec_data['x_rec'].shape[2] == rec_data['x_orig'].shape[2]: + pass_dim_check = True + + else: + if rec_data['x_rec'].shape[0] == self.data.shape[0] and rec_data['x_rec'].shape[1] == self.data.shape[1] and rec_data['x_rec'].shape[2] == self.data.shape[2]: + pass_dim_check = True + + self.assertTrue(pass_dim_check) + #***************************************** + + +# Run this file via: python utest_data_reconstruction.py +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/utests/utest_hpo_keras_cnn_ae.py b/utests/utest_hpo_keras_cnn_ae.py new file mode 100644 index 0000000..d5683d2 --- /dev/null +++ b/utests/utest_hpo_keras_cnn_ae.py @@ -0,0 +1,149 @@ +import jlab_datascience_toolkit.models as models +from jlab_datascience_toolkit.utils.get_mnist import get_mnist_data +import unittest +import numpy as np +import matplotlib.pyplot as plt +import os +import tensorflow as tf + +class UTestHPOKerasCNNAE(unittest.TestCase): + + # Initialize: + #***************************************** + def __init__(self,*args, **kwargs): + super(UTestHPOKerasCNNAE,self).__init__(*args, **kwargs) + + # Get an intro: + print(" ") + print("**********************************") + print("* *") + print("* HPO Keras CNN AE Unit-Test *") + print("* *") + print("**********************************") + print(" ") + + # First,get the MNIST data set offline or online: + print("Get MNIST data...") + + data, _, _, _ = get_mnist_data() + + print("...done!") + print(" ") + + # Do some minor preprocessing: + print("Preprocess data...") + + data = data.reshape((data.shape[0], 28, 28, 1)) / 255. 
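+ # Binarize the images by thresholding at 0.5, so every pixel is either 0 or 1: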
+ self.data = np.where(data > .5, 1.0, 0.0).astype('float32') + + print("...done!") + print(" ") + #***************************************** + + # Test drive the model: + #***************************************** + def test_drive_model(self): + # Set the model id: + model_id = 'HPOKerasCNNAE_v0' + + # Store the results of this unit-test somewhere: + result_loc = 'results_utest_hpo_keras_cnn_ae' + os.makedirs(result_loc,exist_ok=True) + + # Get the default configuration: + this_file_loc = os.path.dirname(__file__) + cfg_loc = os.path.join(this_file_loc,'../jlab_datascience_toolkit/cfgs/defaults/hpo_keras_cnn_ae_cfg.yaml') + + # And maybe add some user specific values: + use_conv_latent_layer = False + user_cfg = { + 'image_dimensions':(self.data.shape[1],self.data.shape[2],self.data.shape[3]), + 'hpo_result_folder':result_loc, + 'max_pooling':[2,2], + 'optimizer': 'legacy_adam', + 'n_hpo_trials': 10, + 'n_epochs_per_trial': 10, + 'n_epochs': 20 + } + + print("Set up model...") + + # Set the model: + hpo_model = models.make(model_id,path_to_cfg=cfg_loc,user_config=user_cfg) + + print("...done!") + print(" ") + + # Do a short training: + print("Run short training...") + + loss_dict = hpo_model.train(self.data) + + print("...done!") + print(" ") + + print("Test model response...") + + preds = hpo_model.predict(self.data[:6],True) + rec_data = preds['x_rec'] + latent_data = preds['z_model'] + + print("...done!") + print(" ") + + print("Visualize and store results...") + plt.rcParams.update({'font.size':20}) + + # Reconstruction: + figr,axr = plt.subplots(2,6,figsize=(17,8),sharex=True,sharey=True) + + #++++++++++++++++++ + for i in range(6): + axr[0,i].imshow(self.data[i]) + axr[1,i].imshow(rec_data[i]) + #++++++++++++++++++ + + figr.savefig(result_loc+"/reconstructed_data.png") + plt.close(figr) + + # Latent dimension: + if use_conv_latent_layer == True: + figl,axl = plt.subplots(1,6,figsize=(17,8)) + + #++++++++++++++++++ + for i in range(6): + axl[i].imshow(latent_data[i]) + #++++++++++++++++++ + + figl.savefig(result_loc+"/latent_features.png") + plt.close(figl) + + # Training curves: + fig,ax = plt.subplots(figsize=(12,8)) + + ax.plot(loss_dict['loss'],linewidth=3.0,label='Training') + ax.plot(loss_dict['val_loss'],linewidth=3.0,label='Validation') + ax.set_xlabel('Epochs') + ax.set_ylabel('Loss') + ax.grid(True) + ax.legend() + + fig.savefig(result_loc+"/learning_curves.png") + plt.close(fig) + + print("...done!") + print(" ") + + # The loss dict should have two losses: one for training and one for validation. 
Each should + # be lists with n_epochs elements: + pass_dim_test = False + if len(loss_dict['loss']) == len(loss_dict['val_loss']): + pass_dim_test = True + + self.assertTrue(pass_dim_test) + #***************************************** + + +# Run this file via: python utest_hpo_keras_cnn_ae.py +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/utests/utest_image_to_numpy_parser.py b/utests/utest_image_to_numpy_parser.py new file mode 100644 index 0000000..875b044 --- /dev/null +++ b/utests/utest_image_to_numpy_parser.py @@ -0,0 +1,118 @@ +import jlab_datascience_toolkit.data_parsers as parsers +import unittest +import numpy as np +import matplotlib.pyplot as plt +import os +import tensorflow as tf + +class UTestImageToNumpyParser(unittest.TestCase): + + # Initialize: + #***************************************** + def __init__(self,*args, **kwargs): + super(UTestImageToNumpyParser,self).__init__(*args, **kwargs) + + # Get an into: + print(" ") + print("****************************************") + print("* *") + print("* Unit Test: Image To Numpy Parser *") + print("* *") + print("****************************************") + print(" ") + + # First, we download the MNIST data set: + print("Get MNIST data...") + + (x_train,_), _ = tf.keras.datasets.mnist.load_data() + + print("...done!") + print(" ") + + # Then randomly pick N images: + self.n_images = 100 + + print("Randomly pick " + str(self.n_images) + " images and convert them to .png files...") + + idx = np.random.randint(0,x_train.shape[0],(self.n_images,)) + images = x_train[idx] + + # Store the images locally as .png files: + store_name = 'mnist_image' + # Collect the image names in a list so that we can use them in the parser config: + self.image_names = [] + #+++++++++++++++++++++ + for i in range(self.n_images): + current_name = store_name+str(i) + '.png' + self.image_names.append(current_name) + + fig,ax = plt.subplots() + + ax.imshow(images[i]) + fig.savefig(current_name) + + plt.close(fig) + #+++++++++++++++++++++ + + print("...done!") + print(" ") + + print("Load parser...") + + # Set up the configuration file for the image to numpy parser: + parser_cfg = { + 'image_loc': self.image_names, #--> Provide a list of images + } + this_file_loc = os.path.dirname(__file__) + cfg_loc = os.path.join(this_file_loc,'../jlab_datascience_toolkit/cfgs/defaults/image_to_numpy_parser_cfg.yaml') + # Now get the parser: + self.parser = parsers.make("ImageToNumpyParser_v0",path_to_cfg=cfg_loc,user_config=parser_cfg) + + # Lets see if we can call the information about this module: + self.parser.get_info() + + print("...done!") + print(" ") + #***************************************** + + # Test everything: + #***************************************** + def test_drive_image_to_numpy_parser(self): + # Get the data: + print("Load data...") + + data = self.parser.load_data() + + print("...done!") + print(" ") + + print("Remove .png files...") + + # Delete the .png files so that we do not spam our machine: + #++++++++++++++++++++ + for i in self.image_names: + os.remove(i) + #++++++++++++++++++++ + + print("...done!") + print(" ") + + # We expect the data to have dimension: n_images x height x width x 3 + # We do not care about height, width as it might vary with the settings in imshow. 
+ # Thus, we check if we have 4 dimensions and if the 1st and 4th dimension match + print("Run consistency check...") + + pass_dimension_check = False + if len(data.shape) == 4 and data.shape[0] == self.n_images and data.shape[3] == 3: + pass_dimension_check = True + + # If everything is done right, this should turn true: + self.assertTrue(pass_dimension_check) + + print("...done!") + print(" ") + #***************************************** + +# Run this file via: python utest_numpy_parser.py +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/utests/utest_keras_cnn_ae.py b/utests/utest_keras_cnn_ae.py new file mode 100644 index 0000000..1e181cb --- /dev/null +++ b/utests/utest_keras_cnn_ae.py @@ -0,0 +1,162 @@ +import jlab_datascience_toolkit.models as models +from jlab_datascience_toolkit.utils.get_mnist import get_mnist_data +import unittest +import numpy as np +import matplotlib.pyplot as plt +import os +import tensorflow as tf + +class UTestKerasCNNAE(unittest.TestCase): + + # Initialize: + #***************************************** + def __init__(self,*args, **kwargs): + super(UTestKerasCNNAE,self).__init__(*args, **kwargs) + + # Get an intro: + print(" ") + print("******************************") + print("* *") + print("* Keras CNN AE Unit-Test *") + print("* *") + print("******************************") + print(" ") + + # First,get the MNIST data set offline or online: + print("Get MNIST data...") + + data, _, _, _ = get_mnist_data() + + print("...done!") + print(" ") + + # Do some minor preprocessing: + print("Preprocess data...") + + data = data.reshape((data.shape[0], 28, 28, 1)) / 255. + self.data = np.where(data > .5, 1.0, 0.0).astype('float32') + + print("...done!") + print(" ") + #***************************************** + + # Test drive the model: + #***************************************** + def test_drive_model(self): + # Set the model id: + model_id = 'KerasCNNAE_v0' + + # Store the results of this unit-test somewhere: + result_loc = 'results_utest_keras_cnn_ae' + os.makedirs(result_loc,exist_ok=True) + + # Get the default configuration: + this_file_loc = os.path.dirname(__file__) + cfg_loc = os.path.join(this_file_loc,'../jlab_datascience_toolkit/cfgs/defaults/keras_cnn_ae_cfg.yaml') + + # And maybe add some user specific values: + use_conv_latent_layer = False + user_cfg = { + 'image_dimensions':(self.data.shape[1],self.data.shape[2],self.data.shape[3]), + 'n_epochs': 10, + 'dense_architecture':[3,3], + 'dense_activations':['relu']*2, + 'dense_kernel_inits':['he_normal']*2, + 'dense_bias_inits':['he_normal']*2, + 'latent_space_is_2d':use_conv_latent_layer, + 'optimizer':'legacy_adam', + 'early_stopping_monitor':'val_loss', + 'early_stopping_min_delta':0.00005, + 'early_stopping_patience':3, + 'early_stopping_restore_best_weights':True, + } + + print("Set up model...") + + # Set the model: + model = models.make(model_id,path_to_cfg=cfg_loc,user_config=user_cfg) + + print("...done!") + print(" ") + + # Do a short training: + print("Run short training...") + + loss_dict = model.train(self.data) + + print("...done!") + print(" ") + + print("Test model response...") + + preds = model.predict(self.data[:6],True) + rec_data = preds['x_rec'] + latent_data = preds['z_model'] + + print("...done!") + print(" ") + + print("Visualize and store results...") + plt.rcParams.update({'font.size':20}) + + # Reconstruction: + figr,axr = plt.subplots(2,6,figsize=(17,8),sharex=True,sharey=True) + + #++++++++++++++++++ + for i in range(6): + 
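+ # Top row: original digits, bottom row: their reconstructions: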
axr[0,i].imshow(self.data[i]) + axr[1,i].imshow(rec_data[i]) + #++++++++++++++++++ + + figr.savefig(result_loc+"/reconstructed_data.png") + plt.close(figr) + + # Latent dimension: + if use_conv_latent_layer == True: + figl,axl = plt.subplots(1,6,figsize=(17,8)) + + #++++++++++++++++++ + for i in range(6): + axl[i].imshow(latent_data[i]) + #++++++++++++++++++ + + figl.savefig(result_loc+"/latent_features.png") + plt.close(figl) + + # Training curves: + fig,ax = plt.subplots(figsize=(12,8)) + + ax.plot(loss_dict['loss'],linewidth=3.0,label='Training') + ax.plot(loss_dict['val_loss'],linewidth=3.0,label='Validation') + ax.set_xlabel('Epochs') + ax.set_ylabel('Loss') + ax.grid(True) + ax.legend() + + fig.savefig(result_loc+"/learning_curves.png") + plt.close(fig) + + + print("...done!") + print(" ") + + print("Write model to file...") + + model.save(result_loc) + + print("...done!") + print(" ") + + # The loss dict should have two losses: one for training and one for validation. Each should + # be lists with n_epochs elements: + pass_dim_test = False + if len(loss_dict['loss']) == len(loss_dict['val_loss']): + pass_dim_test = True + + self.assertTrue(pass_dim_test) + #***************************************** + + +# Run this file via: python utest_keras_cnn_ae.py +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/utests/utest_learning_curve_visualizer.py b/utests/utest_learning_curve_visualizer.py new file mode 100644 index 0000000..ee79790 --- /dev/null +++ b/utests/utest_learning_curve_visualizer.py @@ -0,0 +1,81 @@ +import jlab_datascience_toolkit.analyses as analyses +import numpy as np +import unittest +import matplotlib.pyplot as plt +import os + +class UTestLearningCurveVisualizer(unittest.TestCase): + + # Initialize: + #***************************************** + def __init__(self,*args, **kwargs): + super(UTestLearningCurveVisualizer,self).__init__(*args, **kwargs) + + # Get an intro: + print(" ") + print("*******************************************") + print("* *") + print("* Learning Curve Visualizer Unit-Test *") + print("* *") + print("*******************************************") + print(" ") + #***************************************** + + # Test everything: + #***************************************** + def test_learning_curve_visualizer(self): + + + # Create mock data: + print("Create test data...") + x = np.linspace(0,11,11) + + test_data = { + 'data_1':0.9*x, + 'data_2':1.1*x, + 'data_3':x*x, + 'data_4':0.5*x*x + } + + print("...done!") + print(" ") + + + # Load the analyzer: + print("Load analyzer module...") + + # We need the name: + module_id = "LearningCurveVisualizer_v0" + + # Get the path to the default configuration: + this_file_loc = os.path.dirname(__file__) + cfg_loc = os.path.join(this_file_loc,'../jlab_datascience_toolkit/cfgs/defaults/learning_curve_visualizer_cfg.yaml') + + # Specify the name of the output file: + user_cfg = { + 'output_loc': 'results_utest_learning_curve_visualizer', + 'plots':{'plot_a':['data_1','data_2'],'plot_b':['data_3','data_4']}, + 'plot_labels':{'plot_a':['Trial','Some Value'],'plot_b':['Trial','Different Values']}, + 'plot_legends':{'plot_a':['Data 1','Data 2'],'plot_b':['Data 3','Data 4']}, + 'plot_names':{'plot_a':'first_plot','plot_b':'second_plot'} + } + + analyzer = analyses.make(module_id,path_to_cfg=cfg_loc,user_config=user_cfg) + + print("...done!") + print(" ") + + print("Visualize test data...") + + analyzer.run(test_data) + + print("..done! 
Have a wonderful day!") + print(" ") + + + #***************************************** + + +# Run this file via: python utest_learning_curve_visualizer.py +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/utests/utest_mnist_data_parser.py b/utests/utest_mnist_data_parser.py new file mode 100644 index 0000000..2640209 --- /dev/null +++ b/utests/utest_mnist_data_parser.py @@ -0,0 +1,115 @@ +import jlab_datascience_toolkit.data_parsers as parsers +import unittest +import numpy as np +import matplotlib.pyplot as plt +import os + +class UTestMNISTDataParser(unittest.TestCase): + + # Initialize: + #***************************************** + def __init__(self,*args, **kwargs): + super(UTestMNISTDataParser,self).__init__(*args, **kwargs) + + # Get an intro: + print(" ") + print("***********************************") + print("* *") + print("* MNIST Data Parser Unit-Test *") + print("* *") + print("***********************************") + print(" ") + #***************************************** + + # Test the parser: + #***************************************** + def test_mnist_data_parser(self): + print("Load MNIST data parser...") + + # Module name: + module_id = "MNISTDataParser_v0" + + # Location for the default settings: + this_file_loc = os.path.dirname(__file__) + cfg_loc = os.path.join(this_file_loc,'../jlab_datascience_toolkit/cfgs/defaults/mnist_data_parser_cfg.yaml') + + # User settings: + user_cfg = { + 'train_data_percentage': 0.75, + 'test_data_percentage': 0.1, + 'use_labels':True + } + + mnist_parser = parsers.make(module_id,path_to_cfg=cfg_loc,user_config=user_cfg) + + print("...done!") + print(" ") + + # Get the data: + print("Parse MNIST data...") + + mnist_data = mnist_parser.load_data() + + print("...done!") + print(" ") + + # We simply test whether the 0th dimension of the returned data matches the number of stored labels + # --> They should match by construction... + print("Run small sanity check...") + + n_data = mnist_data.shape[0] + n_labels = mnist_parser.mnist_labels.shape[0] + + pass_dim_check = False + if n_data == n_labels: + pass_dim_check = True + + print("...done!") + print(" ") + + print("Generate plots for consistency checks...") + # Create some plots that show that this thing is working: + test_labels = [0,2,7] + n_acc = 3 + + fig,ax = plt.subplots(n_acc,n_acc,figsize=(18,8)) + counter = 0 + #+++++++++++++++++++++++ + for label in test_labels: + cond = (mnist_parser.mnist_labels == label) + current_data = mnist_data[cond] + + idx_acc = np.random.choice(current_data.shape[0],n_acc) + acc_data = current_data[idx_acc] + + ax[counter,0].set_title('Label = ' + str(label)) + ax[counter,0].imshow(acc_data[0]) + ax[counter,0].set_axis_off() + + ax[counter,1].set_title('Label = ' + str(label)) + ax[counter,1].imshow(acc_data[1]) + ax[counter,1].set_axis_off() + + ax[counter,2].set_title('Label = ' + str(label)) + ax[counter,2].imshow(acc_data[2]) + ax[counter,2].set_axis_off() + + counter += 1 + #+++++++++++++++++++++++ + + output_loc = 'results_utest_mnist_data_parser' + os.makedirs(output_loc,exist_ok=True) + + fig.savefig(output_loc+"/mnist_data_plots.png") + plt.close(fig) + + print("...done! 
Have a great day!") + print(" ") + + self.assertTrue(pass_dim_check) + #***************************************** + + +# Run this file via: python utest_mnist_data_parser.py +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/utests/utest_numpy_linear_scaler.py b/utests/utest_numpy_linear_scaler.py new file mode 100644 index 0000000..76a03e0 --- /dev/null +++ b/utests/utest_numpy_linear_scaler.py @@ -0,0 +1,172 @@ +import jlab_datascience_toolkit.data_preps as preps +import unittest +import numpy as np +import matplotlib.pyplot as plt +import os +import shutil + +class UTestNumpyLinearScaler(unittest.TestCase): + + # Initialize: + #***************************************** + def __init__(self,*args, **kwargs): + super(UTestNumpyLinearScaler,self).__init__(*args, **kwargs) + + # Get an into: + print(" ") + print("**************************************") + print("* *") + print("* Unit Test: Numpy Linear Scaler *") + print("* *") + print("**************************************") + print(" ") + #***************************************** + + # Test the min max scaler: + #***************************************** + def test_drive_numpy_linear_scaler(self): + # Create some data first, that we wish to scale: + print("Create test data...") + + test_data = np.random.uniform(5.0,10.0,size=(5000,1)) + + print("...done!") + print(" ") + + #Now load the scaler by defining a user config first: + print("Load numpy linear scaler...") + + this_file_loc = os.path.dirname(__file__) + cfg_loc = os.path.join(this_file_loc,'../jlab_datascience_toolkit/cfgs/defaults/numpy_linear_scaler_cfg.yaml') + param_store_loc = this_file_loc + '/numpy_linear_scaler_params' + scaler_cfg = {'A':2,'B':-10,'store_loc':param_store_loc,'exclude_data':['test0']} + npy_scaler = preps.make("NumpyLinearScaler_v0",path_to_cfg=cfg_loc,user_config=scaler_cfg) + + # Print the module info: + npy_scaler.get_info() + + print("...done!") + print(" ") + + # Run the scaler: + print("Scale data...") + + scaled_data = npy_scaler.run(test_data) + + print("...done!") + print(" ") + + # Undo the scaling: + print("Reverse scaling...") + + unscaled_data = npy_scaler.reverse(scaled_data) + + print("...done!") + print(" ") + + # Check if the data ranges make sense at all: + pass_range_check_1 = False + pass_range_check_2 = False + + print("Run sanity checks...") + + # Check scaled data: + if round(np.min(scaled_data),1) == 0.0 and round(np.max(scaled_data)) == 10.0: + pass_range_check_1 = True + + # Check if the unscaled data has the same limits as the original test data: + if round(np.min(test_data),1) == round(np.min(unscaled_data),1) and round(np.max(test_data),1) == round(np.max(unscaled_data),1): + pass_range_check_2 = True + + print("...done!") + print(" ") + + # Repeat analysis, but use data dictionaries instead: + print("Create dictionary with test data...") + + dict_data = { + 'test0':np.zeros_like(test_data), + 'test1':test_data + } + + print("...done!") + print(" ") + + print("Pass dictionary through linear scaler...") + + scaled_dict_data = npy_scaler.run(dict_data) + + print("...done!") + print(" ") + + print("And reverse everything...") + + unscaled_dict_data = npy_scaler.reverse(scaled_dict_data) + + print("...done!") + print(" ") + + # Run another sanity check on the dictionary data: + print("Run another dimension check...") + + pass_dict_range_check_1 = False + pass_dict_range_check_2 = False + + # Check scaled data: + if round(np.min(scaled_dict_data['test1']),1) == 0.0 and 
round(np.max(scaled_dict_data['test1'])) == 10.0: + pass_dict_range_check_1 = True + + # Check if the unscaled data has the same limits as the original test data: + if round(np.min(dict_data['test1']),1) == round(np.min(unscaled_dict_data['test1']),1) and round(np.max(dict_data['test1']),1) == round(np.max(unscaled_dict_data['test1']),1): + pass_dict_range_check_2 = True + + + print("...done!") + print(" ") + + # Store and load the scaler parameters --> We want to see that the module checkpointing is working + print("Store and retrieve scaler parameters...") + + pass_checkpointing = False + # Store the params: + npy_scaler.save() + + # And read them back in: + param_dict = npy_scaler.load() + + # If everything went right, there should be a file with scaling parameters and the param dictionary + # should not be empty: + if os.path.exists(scaler_cfg['store_loc']) and bool(param_dict): + pass_checkpointing = True + + print("...done!") + print(" ") + + # Clean up: + print("Remove created data...") + + shutil.rmtree('numpy_linear_scaler_params') + + print("...done!") + print(" ") + + # Test if the type checker is working: + pass_type_checker = False + print("Test type checker (an error message should show up below this line)...") + + val = npy_scaler.run([1,2,3,4]) + + if val is None: + pass_type_checker = True + + print("...done!") + print(" ") + + self.assertTrue(pass_range_check_1 & pass_range_check_2 & pass_checkpointing & pass_type_checker & pass_dict_range_check_1 & pass_dict_range_check_2) + + print("Have a great day!") + #***************************************** + +# Run this file via: python utest_numpy_linear_scaler.py +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/utests/utest_numpy_minmax_scaler.py b/utests/utest_numpy_minmax_scaler.py index d00fcb3..23176af 100644 --- a/utests/utest_numpy_minmax_scaler.py +++ b/utests/utest_numpy_minmax_scaler.py @@ -1,4 +1,4 @@ -import jlab_datascience_toolkit.data_prep as preps +import jlab_datascience_toolkit.data_preps as preps import unittest import numpy as np import matplotlib.pyplot as plt diff --git a/utests/utest_numpy_parser.py b/utests/utest_numpy_parser.py index b82c7da..27447c4 100644 --- a/utests/utest_numpy_parser.py +++ b/utests/utest_numpy_parser.py @@ -1,4 +1,4 @@ -import jlab_datascience_toolkit.data_parser as parsers +import jlab_datascience_toolkit.data_parsers as parsers import unittest import numpy as np import matplotlib.pyplot as plt diff --git a/utests/utest_pandas_parser_v0.py b/utests/utest_pandas_parser_v0.py index 1d7b5ad..fc4b86b 100644 --- a/utests/utest_pandas_parser_v0.py +++ b/utests/utest_pandas_parser_v0.py @@ -1,4 +1,4 @@ -from jlab_datascience_toolkit.data_parser import make +from jlab_datascience_toolkit.data_parsers import make import unittest import matplotlib.pyplot as plt import pandas as pd diff --git a/utests/utest_residual_analyzer.py b/utests/utest_residual_analyzer.py new file mode 100644 index 0000000..1cbdc73 --- /dev/null +++ b/utests/utest_residual_analyzer.py @@ -0,0 +1,96 @@ +import jlab_datascience_toolkit.analyses as analyses +from jlab_datascience_toolkit.utils.get_mnist import get_mnist_data +import numpy as np +import unittest +import matplotlib.pyplot as plt +import os + +class UTestResidualAnalyzer(unittest.TestCase): + + # Initialize: + #***************************************** + def __init__(self,*args, **kwargs): + super(UTestResidualAnalyzer,self).__init__(*args, **kwargs) + + # Get an intro: + print(" ") + print("***********************************") +
print("* *") + print("* Residual Analyzer Unit-Test *") + print("* *") + print("***********************************") + print(" ") + + # First,get the MNIST data set offline or online: + print("Get MNIST data...") + + data, _, _, _ = get_mnist_data() + + print("...done!") + print(" ") + + # Do some minor preprocessing: + print("Preprocess data...") + + data = data.reshape((data.shape[0], 28, 28, 1)) / 255. + self.data = np.where(data > .5, 1.0, 0.0).astype('float32') + + print("...done!") + print(" ") + #***************************************** + + # Test everything: + #***************************************** + def test_drive_analyzer(self): + # Create test data: + print("Create test data...") + + # Do not use the entire MNIST data: + idx = np.random.choice(self.data.shape[0],20) + acc_data = self.data[idx] + + # Smearing factor: + f_smear = 5.0 + data_smeared = acc_data * np.random.normal(loc=1.0,scale=f_smear,size=acc_data.shape) + + test_data = { + 'x_real': acc_data, + 'x_rec': data_smeared + } + + print("...done!") + print(" ") + + # Load the analyzer: + print("Load analyzer module...") + + # We need the name: + module_id = "ResidualAnalyzer_v0" + + # Get the path to the default configuration: + this_file_loc = os.path.dirname(__file__) + cfg_loc = os.path.join(this_file_loc,'../jlab_datascience_toolkit/cfgs/defaults/residual_analyzer_cfg.yaml') + + # Specify the name of the output file: + user_cfg = { + 'output_loc': 'results_utest_residual_analyzer' + } + + analyzer = analyses.make(module_id,path_to_cfg=cfg_loc,user_config=user_cfg) + + print("...done!") + print(" ") + + # Analyze test data: + print("Analyze the test data...") + + analyzer.run(test_data) + + print("...done! Have a wonderful day!") + print(" ") + #***************************************** + + +# Run this file via: python utest_residual_analyzer.py +if __name__ == "__main__": + unittest.main() \ No newline at end of file