diff --git a/.gitignore b/.gitignore index dea0b60..ced95ee 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,8 @@ mlruns/ scratch/ dataset/ -data/ plots/ +data/ docs/source diff --git a/nmrcraft/analysis/plotting.py b/nmrcraft/analysis/plotting.py index 9acdcfb..5f78b15 100644 --- a/nmrcraft/analysis/plotting.py +++ b/nmrcraft/analysis/plotting.py @@ -1,3 +1,4 @@ +import matplotlib.patches as mpatches import matplotlib.pyplot as plt import numpy as np from cycler import cycler @@ -15,11 +16,13 @@ def style_setup(): plt.rcParams["text.latex.preamble"] = r"\usepackage{sansmathfonts}" plt.rcParams["axes.prop_cycle"] = cycler(color=colors) - # Use the first color from the custom color cycle - first_color = plt.rcParams["axes.prop_cycle"].by_key()["color"][0] + all_colors = [ + plt.rcParams["axes.prop_cycle"].by_key()["color"][i] + for i in range(len(colors)) + ] plt.rcParams["text.usetex"] = False - return cmap, colors, first_color + return cmap, colors, all_colors def plot_predicted_vs_ground_truth( @@ -33,7 +36,8 @@ def plot_predicted_vs_ground_truth( Returns: None """ - _, _, first_color = style_setup() + _, _, colors = style_setup() + first_color = colors[0] # Creating the plot plt.figure(figsize=(10, 8)) plt.scatter(y_test, y_pred, color=first_color, edgecolor="k", alpha=0.6) @@ -167,3 +171,75 @@ def plot_roc_curve(fpr, tpr, roc_auc, title, path): plt.legend(loc="lower right") plt.savefig(path) plt.close() + + +def plot_with_without_ligands_bar(df): + categories = df["target"].unique() + _, _, colors = style_setup() + first_color = colors[0] + second_color = colors[1] + + # Extract data + + x_pos = np.arange(len(categories)) + bar_width = 0.35 + + # Initialize plot + fig, ax = plt.subplots() + + # Loop through each category and plot bars + for i, category in enumerate(categories): + subset = df[df["target"] == category] + + # Means and error bars + means = subset["accuracy_mean"].values + errors = [ + subset["accuracy_mean"].values + - subset["accuracy_lower_bd"].values, + subset["accuracy_upper_bd"].values + - subset["accuracy_mean"].values, + ] + + # Bar locations for the group + bar_positions = x_pos[i] + np.array([-bar_width / 2, bar_width / 2]) + + # Determine bar colors based on 'nmr_tensor_input_only' field + bar_colors = [ + first_color if x else second_color + for x in subset["nmr_tensor_input_only"] + ] + + # Plotting the bars + ax.bar( + bar_positions, + means, + yerr=np.array(errors), + color=bar_colors, + align="center", + ecolor="black", + capsize=5, + width=bar_width, + ) + + # Labeling and aesthetics + ax.set_ylabel("Accuracy / %") + ax.set_xlabel("Target(s)") + ax.set_xticks(x_pos) + ax.set_xticklabels(categories) + ax.set_title("Accuracy Measurements with Error Bars") + + handles = [ + mpatches.Patch(color=first_color, label="With Ligand Info"), + mpatches.Patch(color=second_color, label="Without Ligand Info"), + ] + ax.legend(handles=handles, loc="best", fontsize=20) + plt.tight_layout() + plt.savefig("plots/exp3_incorporate_ligand_info.png") + print("Saved to plots/exp3_incorporate_ligand_info.png") + + +if __name__ == "main": + import pandas as pd + + df = pd.read_csv("dataset/path_to_results.csv") + plot_with_without_ligands_bar(df) diff --git a/nmrcraft/data/data_utils.py b/nmrcraft/data/data_utils.py new file mode 100644 index 0000000..ea28bf1 --- /dev/null +++ b/nmrcraft/data/data_utils.py @@ -0,0 +1,76 @@ +"""Load and preprocess data.""" + +import os + +import pandas as pd +from datasets import load_dataset + + +class DatasetLoadError(FileNotFoundError): + """Exeption raised when the Dataloader could not find data/dataset.csv, + even after trying to generate it from huggingface""" + + def __init__(self, t): + super().__init__(f"Could not load raw Dataset '{t}'") + + +class InvalidTargetError(ValueError): + """Exception raised when the specified model name is not found.""" + + def __init__(self, t): + super().__init__(f"Invalid target '{t}'") + + +def filename_to_ligands(dataset: pd.DataFrame): + """ + Extract ligands from the filename and add as columns to the dataset. + Assumes that filenames are structured in a specific way that can be parsed into ligands. + """ + filename_parts = dataset["file_name"].str.split("_", expand=True) + dataset["metal"] = filename_parts.get(0) + dataset["geometry"] = filename_parts.get(1) + dataset["E_ligand"] = filename_parts.get(2) + dataset["X1_ligand"] = filename_parts.get(3) + dataset["X2_ligand"] = filename_parts.get(4) + dataset["X3_ligand"] = filename_parts.get(5) + dataset["X4_ligand"] = filename_parts.get(6) + dataset["L_ligand"] = filename_parts.get(7).fillna( + "none" + ) # Fill missing L_ligand with 'none' + return dataset + + +def load_dummy_dataset_locally(datset_path: str = "tests/data.csv"): + dataset = pd.read_csv(datset_path) + return dataset + + +def load_dataset_from_hf( + dataset_name: str = "NMRcraft/nmrcraft", data_files: str = "all_no_nan.csv" +): + """Load the dataset. + + This function loads the dataset using the specified dataset name and data files. + It assumes that you have logged into the Hugging Face CLI prior to calling this function. + + Args: + dataset_name (str, optional): The name of the dataset. Defaults to "NMRcraft/nmrcraft". + data_files (str, optional): The name of the data file. Defaults to 'all_no_nan.csv'. + + Returns: + pandas.DataFrame: The loaded dataset as a pandas DataFrame. + """ + # Create data dir if needed + if not os.path.isdir("dataset"): + os.mkdir("dataset") + # Check if hf dataset is already downloaded, else download it and then load it + if not os.path.isfile("dataset/dataset.csv"): + dataset = load_dataset(dataset_name, data_files=data_files)[ + "train" + ].to_pandas() + dataset.to_csv("dataset/dataset.csv") + if os.path.isfile("dataset/dataset.csv"): + dataset = pd.read_csv("dataset/dataset.csv") + elif not os.path.isfile("dataset/dataset.csv"): + raise DatasetLoadError(FileNotFoundError) + return dataset diff --git a/nmrcraft/data/dataloader.py b/nmrcraft/data/dataloader.py new file mode 100644 index 0000000..189c364 --- /dev/null +++ b/nmrcraft/data/dataloader.py @@ -0,0 +1,210 @@ +"""Load and preprocess data.""" + +from typing import Any, List, Tuple + +import numpy as np +import pandas as pd +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import ( + LabelEncoder, + StandardScaler, +) + +from nmrcraft.data.data_utils import ( + filename_to_ligands, + load_dataset_from_hf, + load_dummy_dataset_locally, +) +from nmrcraft.utils.set_seed import set_seed + +set_seed() + +TARGET_TYPES = [ + "metal", + "X1_ligand", + "X2_ligand", + "X3_ligand", + "X4_ligand", + "L_ligand", + "E_ligand", +] + + +class DataLoader: + def __init__( + self, + feature_columns: Any, + target_columns: str, + complex_geometry: str, + test_size: float, + random_state: int, + dataset_size: float, + include_structural_features: bool, + testing: bool, + ): + self.feature_columns = feature_columns + self.test_size = test_size + self.random_state = random_state + self.dataset_size = dataset_size + self.target_columns = target_columns + self.complex_geometry = complex_geometry + self.include_structural_features = include_structural_features + + if not testing: + self.dataset = load_dataset_from_hf() + elif testing: + self.dataset = load_dummy_dataset_locally() + + def load_data(self) -> pd.DataFrame: + """ + Loads the dataset, preprocesses it, and returns the preprocessed data. + + Returns: + Preprocessed data (pandas.DataFrame): The preprocessed dataset. + """ + self.dataset = filename_to_ligands(self.dataset) + self.dataset = self.dataset.sample(frac=self.dataset_size) + self.choose_geometry() + return self.split_and_preprocess() + + def choose_geometry(self) -> None: + """ + Filters the dataset based on the complex geometry. + + This method filters the dataset based on the complex geometry specified by the `complex_geometry` attribute. + It checks if the specified geometry is valid and then updates the dataset accordingly. If the geometry is not + valid, a `ValueError` is raised. + + Raises: + ValueError: If the specified geometry is not valid. + + """ + valid_geometries = {"oct", "spy", "tbp"} + if self.complex_geometry in valid_geometries: + self.dataset = self.dataset[ + self.dataset["geometry"] == self.complex_geometry + ] + # else: + # raise ValueError("Invalid geometry'.") FIXME + + def encode_categorical_features(self) -> np.ndarray: + """ + Encodes the categorical features in the dataset using LabelEncoder. + + Returns: + np.ndarray: The encoded features in numpy array format. + """ + # Select and extract the structural features from the dataset + structural_features = ( + self.dataset[ + [col for col in TARGET_TYPES if col not in self.target_columns] + ] + .to_numpy() + .T + ) # Transpose immediately after conversion to numpy + + # Encode features using LabelEncoder and store encoders for potential inverse transform + encoded_features = [] + self.encoders = [] # To store encoders for each feature + for features in structural_features: + encoder = LabelEncoder() + encoder.fit(features) + encoded_features.append(encoder.transform(features)) + self.encoders.append(encoder) + + # Convert the list of encoded features back to the original data structure + return np.array( + encoded_features + ).T # Transpose back to original orientation + + def encode_targets(self) -> Tuple[np.ndarray, dict]: + """ + Encodes the target variables in the dataset using LabelEncoder. + + Returns: + Tuple[np.ndarray, dict]: The encoded targets and a dictionary mapping target names to labels. + """ + # Initialize lists to store encoded targets and corresponding encoders + encoded_targets = [] + self.target_encoders = [] + y_labels_dict = {} + + # Encode each target column using LabelEncoder + for target_name in self.target_columns: + target = self.dataset[target_name].to_numpy() + encoder = LabelEncoder() + encoder.fit(target) + encoded_targets.append(encoder.transform(target)) + self.target_encoders.append(encoder) + y_labels_dict[ + target_name + ] = ( + encoder.classes_.tolist() + ) # Dictionary of labels for each target + + y_encoded = np.array( + encoded_targets + ).T # Transpose to match original data structure + return y_encoded, y_labels_dict + + def split_and_preprocess( + self, + ) -> Tuple[ + np.ndarray, np.ndarray, np.ndarray, np.ndarray, List[List[str]] + ]: + """ + Split the dataset into training and testing sets, preprocess the data, and return the preprocessed data. + + Returns: + Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, List[List[str]]]: A tuple containing the preprocessed training and testing data, encoded target variables, and readable labels. + """ + # Extract and encode categorical features + X_NMR = self.dataset[self.feature_columns].to_numpy() + X_Structural = self.encode_categorical_features() + + # Encode target variables and store readable labels + ( + y_encoded, + y_labels, + ) = self.encode_targets() + + # Split data into training and testing sets + ( + X_train_NMR, + X_test_NMR, + X_train_Structural, + X_test_Structural, + y_train, + y_test, + ) = train_test_split( + X_NMR, + X_Structural, + y_encoded, + test_size=self.test_size, + random_state=self.random_state, + ) + + # Scale numerical features (the NMR tensor) + scaler = StandardScaler() + X_train_NMR_scaled = scaler.fit_transform(X_train_NMR) + X_test_NMR_scaled = scaler.transform(X_test_NMR) + + # Combine features if structural features are included + if self.include_structural_features: + X_train = np.concatenate( + [X_train_NMR_scaled, X_train_Structural], axis=1 + ) + X_test = np.concatenate( + [X_test_NMR_scaled, X_test_Structural], axis=1 + ) + else: + X_train = X_train_NMR_scaled + X_test = X_test_NMR_scaled + + return ( + X_train, + X_test, + np.squeeze(y_train), + np.squeeze(y_test), + y_labels, + ) diff --git a/nmrcraft/data/dataset.py b/nmrcraft/data/dataset.py deleted file mode 100644 index c179140..0000000 --- a/nmrcraft/data/dataset.py +++ /dev/null @@ -1,572 +0,0 @@ -"""Load and preprocess data.""" - -import itertools -import os - -import numpy as np -import pandas as pd -from datasets import load_dataset -from sklearn.model_selection import train_test_split -from sklearn.preprocessing import ( - LabelBinarizer, - LabelEncoder, - OneHotEncoder, - StandardScaler, -) - -from nmrcraft.utils.set_seed import set_seed - -set_seed() - - -class DatasetLoadError(FileNotFoundError): - """Exeption raised when the Dataloader could not find data/dataset.csv, - even after trying to generate it from huggingface""" - - def __init__(self, t): - super().__init__(f"Could not load raw Dataset '{t}'") - - -class InvalidTargetError(ValueError): - """Exception raised when the specified model name is not found.""" - - def __init__(self, t): - super().__init__(f"Invalid target '{t}'") - - -class InvalidTargetTypeError(ValueError): - """Exception raised when the specified target type is not valid.""" - - def __init__(self, t): - super().__init__(f"Invalid target Type '{t}'") - - -def filename_to_ligands(dataset: pd.DataFrame): - """ - Extract ligands from the filename and add as columns to the dataset. - Assumes that filenames are structured in a specific way that can be parsed into ligands. - """ - filename_parts = dataset["file_name"].str.split("_", expand=True) - dataset["metal"] = filename_parts.get(0) - dataset["geometry"] = filename_parts.get(1) - dataset["E_ligand"] = filename_parts.get(2) - dataset["X1_ligand"] = filename_parts.get(3) - dataset["X2_ligand"] = filename_parts.get(4) - dataset["X3_ligand"] = filename_parts.get(5) - dataset["X4_ligand"] = filename_parts.get(6) - dataset["L_ligand"] = filename_parts.get(7).fillna( - "none" - ) # Fill missing L_ligand with 'none' - return dataset - - -def load_dummy_dataset_locally(datset_path: str = "tests/data.csv"): - dataset = pd.read_csv(datset_path) - return dataset - - -def load_dataset_from_hf( - dataset_name: str = "NMRcraft/nmrcraft", data_files: str = "all_no_nan.csv" -): - """Load the dataset. - - This function loads the dataset using the specified dataset name and data files. - It assumes that you have logged into the Hugging Face CLI prior to calling this function. - - Args: - dataset_name (str, optional): The name of the dataset. Defaults to "NMRcraft/nmrcraft". - data_files (str, optional): The name of the data file. Defaults to 'all_no_nan.csv'. - - Returns: - pandas.DataFrame: The loaded dataset as a pandas DataFrame. - """ - # Create data dir if needed - if not os.path.isdir("data"): - os.mkdir("data") - # Check if hf dataset is already downloaded, else download it and then load it - if not os.path.isfile("data/dataset.csv"): - dataset = load_dataset(dataset_name, data_files=data_files)[ - "train" - ].to_pandas() - dataset.to_csv("data/dataset.csv") - if os.path.isfile("data/dataset.csv"): - dataset = pd.read_csv("data/dataset.csv") - elif not os.path.isfile("data/dataset.csv"): - raise DatasetLoadError(FileNotFoundError) - return dataset - - -def transpose(array: any): - """rotate/transpose array to the right""" - ar = array[:] # make copy just to be sure - ar = [ # rotate the array to the right - list(x) if i == 0 else x for i, x in enumerate(map(list, zip(*ar))) - ] - return ar - - -def get_target_columns(target_columns: str): - """ - Function takes target columns in underline format f.e 'metal_X1_X4_X2_L' and - transforms into a list of the column names present in the dataset. - """ - TARGET_TYPES = ["metal", "X1", "X2", "X3", "X4", "L", "E"] - - # Split the target string into individual targets - targets = [t.strip() for t in target_columns.split("_")] - - # Check if the targets are valid - for t in targets: - if t not in TARGET_TYPES: - raise InvalidTargetError(t) - - # Translate them into Dataframe Column names - target_map = { - "metal": "metal", - "X1": "X1_ligand", - "X2": "X2_ligand", - "X3": "X3_ligand", - "X4": "X4_ligand", - "L": "L_ligand", - "E": "E_ligand", - } - targets_transformed = [target_map[t] for t in targets] - - return targets_transformed - - -def get_structural_feature_columns(target_columns: list): - """ - Function gets the feature columns given the target columns. The feature columns are those that will be in the X set. - """ - TARGET_TYPES = [ - "metal", - "X1_ligand", - "X2_ligand", - "X3_ligand", - "X4_ligand", - "L_ligand", - "E_ligand", - ] - - # Get the features as the not targets - features = [x for x in TARGET_TYPES if x not in target_columns] - - return features - - -def target_label_readabilitizer(readable_labels): - """ - function takes in the classes from the binarzier and turns them into human readable list of same length of the target. - """ - # Trun that class_ into list - human_readable_label_list = list(itertools.chain(*readable_labels)) - # Handle Binarized metal stuff and make the two columns become a single one because the metals get turned into a single column by the binarizer - for i in enumerate(human_readable_label_list): - if ( - human_readable_label_list[i[0]] == "Mo" - and human_readable_label_list[i[0] + 1] == "W" - ) or ( - human_readable_label_list[i[0]] == "W" - and human_readable_label_list[i[0] + 1] == "Mo" - ): - human_readable_label_list[i[0]] = "Mo W" - human_readable_label_list.pop(i[0] + 1) - - return human_readable_label_list - - -def target_label_readabilitizer_categorical(target_labels): - good_labels = [] - for label_array in target_labels: - good_labels.append(list(label_array)) - return good_labels - - -def column_length_to_indices(column_lengths): - indices = [] - start_index = 0 - for length in column_lengths: - if length == 1: - indices.append([start_index]) - else: - indices.append(list(range(start_index, start_index + length))) - start_index += length - return indices - - -class DataLoader: - def __init__( - self, - dataset_name="NMRcraft/nmrcraft", - data_files="all_no_nan.csv", - feature_columns=None, - target_columns="metal", - target_type="one-hot", # can be "categorical" or "one-hot", - complex_geometry="all", - test_size=0.3, - random_state=42, - dataset_size=0.01, - include_structural_features=True, - testing=False, - ): - self.feature_columns = feature_columns - self.target_columns = get_target_columns(target_columns=target_columns) - self.test_size = test_size - self.random_state = random_state - self.dataset_size = dataset_size - self.target_type = target_type - self.complex_geometry = complex_geometry - self.include_structural_features = include_structural_features - - if not testing: - self.dataset = load_dataset_from_hf() - elif testing: - self.dataset = load_dummy_dataset_locally() - - def load_data(self): - self.dataset = filename_to_ligands(self.dataset) - self.dataset = self.dataset.sample(frac=self.dataset_size) - self.choose_geometry() - if self.target_type == "categorical": - return self.split_and_preprocess_categorical() - elif ( - self.target_type == "one-hot" - ): # Target is binarized and Features are one hot - return self.split_and_preprocess_one_hot() - else: - raise InvalidTargetTypeError(ValueError) - - def choose_geometry(self): - """ - Reduce the dataset down to a certain geometry if a valid - one was passed, else just leave it as is. - """ - if self.complex_geometry == "oct": - self.dataset = self.dataset[ - self.dataset["geometry"] == "oct" - ] # only load octahedral complexes - elif self.complex_geometry == "spy": - self.dataset = self.dataset[ - self.dataset["geometry"] == "spy" - ] # only load square pyramidal complexes - elif self.complex_geometry == "tbp": - self.dataset = self.dataset[ - self.dataset["geometry"] == "tbp" - ] # only load trigonal bipyramidal complexes - - def scale(self, X): - """ - Apply standard normalization to the feature set. - """ - scaler = StandardScaler() - X_scaled = scaler.fit_transform(X) - return X_scaled - - def get_target_columns_separated(self): - """Returns the column indicies of the target array nicely sorted. - For example: metal_X1: [[0, 1], [1, 2, 3, 4]]""" - if ( - "metal" in self.target_columns - ): # If targets have metal, do weird stuff - metal_index = self.target_columns.index("metal") - y_column_indices = column_length_to_indices( - self.target_column_numbers - ) - for i in range(len(y_column_indices)): - if i == metal_index: - y_column_indices[i].append(y_column_indices[i][0] + 1) - if i > metal_index: - y_column_indices[i] = [x + 1 for x in y_column_indices[i]] - - elif "metal" not in self.target_columns: - y_column_indices = column_length_to_indices( - self.target_column_numbers - ) - return y_column_indices - - def more_than_one_target(self): - """Function returns true if more than one target is specified""" - return len(self.target_columns) > 1 - - def categorical_target_decoder(self, y): - """ - function takes in the target (y) array and transforms it back to decoded form. - For this function to be run the split_and_preprocess_categorical already has to have been run beforehand. - """ - ys = y[:] # copy y so it's not modified - target_encoders = self.target_label_encoders - ys_decoded = [] - ys = transpose(ys) - - # Decode columnwise - for i, target_column in enumerate(ys): - ys_decoded.append( - target_encoders[i].inverse_transform(target_column) - ) - - # Rotate back so each row corresponds to a complex and not the target like metal or X4 - ys_decoded_properly_rotated = [ - list(x) if i == 0 else x - for i, x in enumerate(map(list, zip(*ys_decoded))) - ] - - return np.array(ys_decoded_properly_rotated) - - def binarized_target_decoder(self, y): - """ - function takes in the target (y) array and transforms it back to decoded form. - For this function to be run the one-hot-preprocesser already has to have been run beforehand. - """ - y_column_indices = column_length_to_indices(self.target_column_numbers) - ys = [] - ys_decoded = [] - # Split up compressed array into the categories - for i in range(len(y_column_indices)): - ys.append(y[:, y_column_indices[i]]) - - # Decode the binarized categries using the original binarizers - for i in range(len(ys)): - ys_decoded.append(self.encoders[i].inverse_transform(ys[i])) - - # Rotate the array - ys_decoded_properly_rotated = [ - list(x) if i == 0 else x - for i, x in enumerate(map(list, zip(*ys_decoded))) - ] - return ys_decoded_properly_rotated - - def confusion_matrix_data_adapter_categorical(self, y): - """ - Takes in binary encoded target array and returns decoded flat list. - Especially designed to work with confusion matrix. - """ - y_decoded = self.categorical_target_decoder(y) - flat_y_decoded = [y for ys in y_decoded for y in ys] - return flat_y_decoded - - def confusion_matrix_data_adapter_one_hot(self, y): - """ - Takes in binary encoded target array and returns decoded flat list. - Especially designed to work with confusion matrix. - """ - y_decoded = self.binarized_target_decoder(y) - flat_y_decoded = [y for ys in y_decoded for y in ys] - return flat_y_decoded - - def confusion_matrix_label_adapter(self, y_labels): - y_labels_copy = y_labels[:] - for i in range(len(y_labels)): - if y_labels_copy[i] == "Mo W": - y_labels_copy[i] = "Mo" - y_labels_copy.insert(i, "W") - return y_labels_copy - - def categorical_endocode_X(self): - # Get NMR Featrues (passed ones) and structural Features - X_Structural_Features_Columns = get_structural_feature_columns( - target_columns=self.target_columns - ) - X_Structural_Features = self.dataset[ - X_Structural_Features_Columns - ].to_numpy() - - # Transpose the array - X_Structural_Features = transpose(X_Structural_Features) - - # Target-wise encoding with Label encoder and save encoders for later decoding - xs = [] - for i in range(len(X_Structural_Features)): - tmp_encoder = LabelEncoder() - tmp_encoder.fit(X_Structural_Features[i]) - xs.append(tmp_encoder.transform(X_Structural_Features[i])) - X_Structural_Features = list(zip(*xs)) # Kind of backtransposing - - return X_Structural_Features - - def categorical_endocode_y(self): - # Get the targets - y_labels_rotated = self.dataset[self.target_columns].to_numpy() - - # rotate the list of list (array-like) - y_labels = transpose(y_labels_rotated) - - # Do targetwise encoding using the label encoder and save the label encoders for later decoding - ys = [] - self.target_label_encoders = [] - readable_labels = [] - for i in range(len(y_labels)): - tmp_encoder = LabelEncoder() - tmp_encoder.fit(y_labels[i]) - ys.append(tmp_encoder.transform(y_labels[i])) - self.target_label_encoders.append(tmp_encoder) - readable_labels.append(tmp_encoder.classes_) - # Combine y - y = np.array(list(zip(*ys))) - # Return y fuzed into a single array and y_labels - return y, readable_labels - - def one_hot_endocode_X(self): - """ - Method that does the one-hot encoding of the DataLoader's features - based on the selected targets - """ - # Get Columns corresponding to the features that are selected - X_Structural_Features_Columns = get_structural_feature_columns( - self.target_columns - ) - - # Get the features based on the selected columns - X_Structural_Features = self.dataset[ - X_Structural_Features_Columns - ].to_numpy() - - # One hot encode X structural - X_Structural_Features_enc = ( - OneHotEncoder().fit_transform(X_Structural_Features).toarray() - ) - - return X_Structural_Features_enc - - def label_binarize_endocode_y(self): - - # Get the Targets and transpose - y_labels_rotated = self.dataset[self.target_columns].to_numpy() - y_labels = transpose(y_labels_rotated) - - ys = [] - readable_labels = [] - self.encoders = [] - self.target_column_numbers = [] - - # Binarize targetwise and save encoders and labels - for i in range(len(y_labels)): - # Encode - label_binerizer = LabelBinarizer() - ys.append(label_binerizer.fit_transform(y_labels[i])) - - # Save stuff for later decoding - readable_labels.append(label_binerizer.classes_) - self.encoders.append( - label_binerizer - ) # save encoder for later decoding - self.target_column_numbers.append( - len(ys[i][0]) - ) # save column numbers for later decoding - - # Return y fuzed into a single array and labels - y = np.concatenate(list(ys), axis=1) - return y, readable_labels - - def split_and_preprocess_categorical(self): - """ - Split data into training and test sets, then apply normalization. - Ensures that the test data does not leak into training data preprocessing. - X and y are categorical, so each column has a integer that defines which one of the ligands is in the column. - """ - - # Get NMR features - X_NMR = self.dataset[self.feature_columns].to_numpy() - - # Encode X in a categorical fashion with the label encoder columnwise - X_Structural_Features = self.categorical_endocode_X() - - # Encode y in a categorical fashion with the label encoder columnwise - y, readable_labels = self.categorical_endocode_y() - - # Train Test splitting - ( - X_train_NMR, - X_test_NMR, - X_train_structural, - X_test_structural, - y_train, - y_test, - ) = train_test_split( - X_NMR, - X_Structural_Features, - y, - test_size=self.test_size, - random_state=self.random_state, - ) - - # Normalize features with no leakage from test set - X_train_NMR_scaled = self.scale(X_train_NMR) - X_test_NMR_scaled = self.scale(X_test_NMR) - - if self.include_structural_features: - # Combine scaled NMR features with structural features to get final X - X_train_scaled = np.concatenate( - [X_train_NMR_scaled, X_train_structural], axis=1 - ) - X_test_scaled = np.concatenate( - [X_test_NMR_scaled, X_test_structural], axis=1 - ) - else: - # Just have the NMR features as X - X_train_scaled = X_train_NMR_scaled - X_test_scaled = X_test_NMR_scaled - - # Get the target labels going - y_label = target_label_readabilitizer_categorical(readable_labels) - - return X_train_scaled, X_test_scaled, y_train, y_test, y_label - - def split_and_preprocess_one_hot(self): - """ - Split data into training and test sets, then apply normalization. - Ensures that the test data does not leak into training data preprocessing. Returned X is one-hot encoded and y binarized using the sklearn functions. - """ - # Get NMR features - X_NMR = self.dataset[self.feature_columns].to_numpy() - - # Get structural features one-hot encoded - X_Structural_Features_enc = self.one_hot_endocode_X() - - # Get structural targets, binarized - y, readable_labels = self.label_binarize_endocode_y() - - # Split the datasets - ( - X_train_NMR, - X_test_NMR, - X_train_structural, - X_test_structural, - y_train, - y_test, - ) = train_test_split( - X_NMR, - X_Structural_Features_enc, - y, - test_size=self.test_size, - random_state=self.random_state, - ) - - # Normalize features with no leakage from test set - X_train_NMR_scaled = self.scale(X_train_NMR) - X_test_NMR_scaled = self.scale(X_test_NMR) - - if self.include_structural_features: - # Combine scaled NMR features with structural features to get final X - X_train_scaled = np.concatenate( - [X_train_NMR_scaled, X_train_structural], axis=1 - ) - X_test_scaled = np.concatenate( - [X_test_NMR_scaled, X_test_structural], axis=1 - ) - else: - # Just have the NMR features as X - X_train_scaled = X_train_NMR_scaled - X_test_scaled = X_test_NMR_scaled - - # Creates the labels that can be used to identify the targets in the binaized y-array - # (basicall handle special metal behaviour) - good_target_labels = target_label_readabilitizer(readable_labels) - - return ( - X_train_scaled, - X_test_scaled, - y_train, - y_test, - good_target_labels, - ) diff --git a/nmrcraft/evaluation/evaluation.py b/nmrcraft/evaluation/evaluation.py index 0b4d1a3..7c30b6b 100644 --- a/nmrcraft/evaluation/evaluation.py +++ b/nmrcraft/evaluation/evaluation.py @@ -10,7 +10,7 @@ roc_curve, ) -from nmrcraft.data import dataset +from nmrcraft.data import dataloader def model_evaluation( @@ -18,7 +18,7 @@ def model_evaluation( X_test: Any, y_test: Any, y_labels: Any, - dataloader: dataset.DataLoader, + dataloader: dataloader.DataLoader, ) -> Tuple[Dict[str, float], Any, Any, Any]: """ Evaluate the performance of the trained machine learning model for 1D targets. @@ -67,7 +67,7 @@ def model_evaluation_nD( X_test: Any, y_test: Any, y_labels: Any, - dataloader: dataset.DataLoader, + dataloader: dataloader.DataLoader, ) -> Tuple[Dict[str, float], Any, Any, Any]: """ Evaluate the performance of the trained machine learning model for 2D+ Targets. diff --git a/nmrcraft/evaluation/visualizer.py b/nmrcraft/evaluation/visualizer.py index 9ad3a91..77a92f7 100644 --- a/nmrcraft/evaluation/visualizer.py +++ b/nmrcraft/evaluation/visualizer.py @@ -2,60 +2,119 @@ import os import matplotlib.pyplot as plt +import numpy as np +from cycler import cycler +from matplotlib.colors import LinearSegmentedColormap class Visualizer: - def __init__(self, model_name: str, data: None, folder_path: str): + def __init__( + self, + model_name: str, + cm: None, + rates=None, + metrics=None, + folder_path: str = "plots/", + classes=None, + dataset_size=None, + ): self.model_name = model_name - self.data = data + self.cm = cm + self.rates = (rates,) + self.metrics = metrics self.folder_path = folder_path + self.classes = classes + self.dataset_size = dataset_size + if not os.path.exists(folder_path): + os.makedirs(folder_path) - def plot_ROC( - self, title="ROC Curves by Dataset Size", filename="ROC_Curves.png" - ): - print(self.data.index) - plt.figure(figsize=(10, 8)) + def style_setup(): + """Function to set up matplotlib parameters.""" colors = [ - "blue", - "green", - "red", - "violet", - "orange", - "cyan", - ] # Colors for different dataset sizes - labels = [ - f"Dataset Size: {idx}" for idx in self.data.index - ] # Labels for legend + "#C28340", + "#854F2B", + "#61371F", + "#8FCA5C", + "#70B237", + "#477A1E", + ] + cmap = LinearSegmentedColormap.from_list("custom", colors) - for (index, row), color, label in zip( - self.data.iterrows(), colors, labels - ): - index = index + 1 - plt.plot( - row["fpr"], - row["tpr"], - label=f'{label} (AUC = {row["roc_auc"]:.2f})', - color=color, - ) + plt.style.use("./style.mplstyle") + plt.rcParams["text.latex.preamble"] = r"\usepackage{sansmathfonts}" + plt.rcParams["axes.prop_cycle"] = cycler(color=colors) + + # Use the first color from the custom color cycle + first_color = plt.rcParams["axes.prop_cycle"].by_key()["color"][0] + plt.rcParams["text.usetex"] = False + + return cmap, colors, first_color - plt.plot( - [0, 1], - [0, 1], - linestyle="--", - lw=2, - color="gray", - label="Chance", - alpha=0.8, + def plot_confusion_matrix(self, full=True, columns_set=False): + """ + Plots the confusion matrix. + Parameters: + - classes (list): List of classes for the axis labels. + - title (str): Title of the plot. + - full (bool): If true plots one big, else many smaller. + - columns_set (list of lists): contains all relevant indices. + Returns: + None + """ + + def normalize_row_0_1(row): + return (row - np.min(row)) / (np.max(row) - np.min(row)) + + file_path = os.path.join( + self.folder_path, + f"ConfusionMatrix_{self.model_name}_{self.dataset_size}.png", ) - plt.title(title) - plt.xlabel("False Positive Rate") - plt.ylabel("True Positive Rate") - plt.legend(loc="lower right") + # _, _, _ = self.style_setup() + if full: # Plot one big cm + plt.figure(figsize=(10, 8)) + plt.imshow( + self.cm.apply(normalize_row_0_1, axis=1), + interpolation="nearest", + cmap=plt.cm.Blues, + ) + plt.title("The Confusion Matrix") + plt.colorbar() + tick_marks = np.arange(len(self.classes)) + plt.xticks(tick_marks, self.classes, rotation=45) + plt.yticks(tick_marks, self.classes) + plt.tight_layout() + plt.ylabel("True label") + plt.xlabel("Predicted label") + plt.savefig(file_path) + plt.close() - file_path = os.path.join(self.folder_path, filename) - plt.savefig(file_path) - plt.close() # Close the plot to free up memory - return file_path + elif not full: # Plot many small cms of each target + cms = [] + for columns in columns_set: # Make list of confusion matrices + cms.append( + self.cm[ + slice(columns[0], columns[-1] + 1), + slice(columns[0], columns[-1] + 1), + ] + ) + fig, axs = plt.subplots(nrows=len(cms), figsize=(10, 8 * len(cms))) + for i, sub_cm in enumerate(cms): + sub_classes = self.classes[ + slice(columns_set[i][0], columns_set[i][-1] + 1) + ] + axs[i].imshow( + sub_cm, interpolation="nearest", cmap=plt.cm.Blues + ) + axs[i].set_title(f"Confusion Matrix {i+1}") + tick_marks = np.arange(len(sub_classes)) + axs[i].set_xticks(tick_marks) + axs[i].set_xticklabels(sub_classes, rotation=45) + axs[i].set_yticks(tick_marks) + axs[i].set_yticklabels(sub_classes) + plt.tight_layout() + # plt.savefig(path) + plt.close() + return file_path def plot_metric( self, diff --git a/nmrcraft/models/classifier.py b/nmrcraft/models/classifier.py index cfd4868..3688d39 100644 --- a/nmrcraft/models/classifier.py +++ b/nmrcraft/models/classifier.py @@ -4,15 +4,14 @@ import pandas as pd from sklearn.metrics import ( accuracy_score, - auc, + confusion_matrix, f1_score, - # confusion_matrix, - multilabel_confusion_matrix, - roc_curve, + precision_score, + recall_score, ) from sklearn.utils import resample -from nmrcraft.data.dataset import DataLoader +from nmrcraft.data.dataloader import DataLoader from nmrcraft.models.model_configs import model_configs from nmrcraft.models.models import load_model from nmrcraft.training.hyperparameter_tune import HyperparameterTuner @@ -26,7 +25,11 @@ def __init__( target: str, dataset_size: float, feature_columns=None, - random_state=None, + random_state=42, + include_structural_features=True, + complex_geometry="oct", + test_size=0.2, + testing=False, ): if not feature_columns: feature_columns = [ @@ -49,17 +52,23 @@ def __init__( max_evals=self.max_evals, ) # algo is set to default value, TODO: change this in declaration of Classifier is necessary + data_loader = DataLoader( + feature_columns=feature_columns, + target_columns=target, + dataset_size=dataset_size, + include_structural_features=include_structural_features, + complex_geometry=complex_geometry, + test_size=test_size, + random_state=random_state, + testing=testing, + ) ( self.X_train, self.X_test, self.y_train, self.y_test, self.y_labels, - ) = DataLoader( - feature_columns=feature_columns, - target_columns=target, - dataset_size=dataset_size, - ).load_data() + ) = data_loader.load_data() def hyperparameter_tune(self): log.info( @@ -90,11 +99,11 @@ def train_bootstraped(self, n_times=10): replace=True, random_state=self.random_state, ) - self.hyperparameter_tune() + # self.hyperparameter_tune() self.train() - eval_data = self.evaluate() - accuracy.append(eval_data["accuracy"]) - f1_score.append(eval_data["f1_score"]) + rates_df, metrics, cm = self.evaluate() + accuracy.append(metrics["Accuracy"]) + f1_score.append(metrics["F1"]) i += 1 new_row = { "accuracy": np.mean(accuracy), @@ -106,36 +115,65 @@ def train_bootstraped(self, n_times=10): } return pd.DataFrame([new_row]) - def evaluate(self) -> pd.DataFrame(): + def evaluate(self) -> pd.DataFrame: """ Evaluate the performance of the trained machine learning model. Returns: - Tuple[Dict[str, float], Any, Any, Any]: A tuple containing: - - A dictionary with evaluation metrics (accuracy, f1_score, roc_auc). - - The confusion matrix. - - The false positive rate. - - The true positive rate. + pd.DataFrame: A DataFrame containing evaluation metrics (accuracy, f1_score, roc_auc), + the confusion matrix, false positive rates, and true positive rates for each class. """ y_pred = self.model.predict(self.X_test) - accuracy = accuracy_score(self.y_test, y_pred) - f1 = f1_score(self.y_test, y_pred, average="weighted") - fpr, tpr, _ = roc_curve( - self.y_test, self.model.predict_proba(self.X_test)[:, 1] - ) - cm = multilabel_confusion_matrix(self.y_test, y_pred) - roc_auc = auc(fpr, tpr) - - # Create DataFrame with consistent structure - results_df = pd.DataFrame( - { - "accuracy": [accuracy], - "f1_score": [f1], - "roc_auc": [roc_auc], - "fpr": [fpr.tolist()], - "cm": [cm.tolist()], - "tpr": [tpr.tolist()], - } - ) + # print(y_pred) + # accuracy = accuracy_score(self.y_test, y_pred) + # f1 = f1_score(self.y_test, y_pred, average="weighted") + + # Binarize the output + # y_test_bin = label_binarize( + # self.y_test, classes=np.unique(self.y_test) + # ) + + # Number of classes + # n_classes = y_test_bin.shape[1] + cm = confusion_matrix(self.y_test, y_pred) + + def calculate_fpr_fnr(cm): + FPR = [] + FNR = [] + num_classes = cm.shape[0] + for i in range(num_classes): + FP = cm[:, i].sum() - cm[i, i] + TN = cm.sum() - (cm[i, :].sum() + cm[:, i].sum() - cm[i, i]) + FN = cm[i, :].sum() - cm[i, i] + TP = cm[i, i] + + FPR.append(FP / (FP + TN)) + FNR.append(FN / (FN + TP)) + return np.array(FPR), np.array(FNR) + + # Calculate FPR and FNR for each class + FPR, FNR = calculate_fpr_fnr(cm) + rates_df = pd.DataFrame() + rates_df["FPR"] = FPR + rates_df["FNR"] = FNR + rates_df.index = self.y_labels + + # Calculating macro-averaged F1 Score, Precision, Recall + Precision = precision_score(self.y_test, y_pred, average="macro") + Recall = recall_score(self.y_test, y_pred, average="macro") + F1 = f1_score(self.y_test, y_pred, average="macro") + + # Calculating Accuracy + Accuracy = accuracy_score(self.y_test, y_pred) + + metrics = pd.DataFrame() + metrics["Accuracy"] = [Accuracy] + metrics["Recall"] = [Recall] + metrics["F1"] = [F1] + metrics["Precision"] = [Precision] + + cm = pd.DataFrame(cm) + cm.columns = self.y_labels + cm.index = self.y_labels - return results_df + return rates_df, metrics, cm diff --git a/nmrcraft/models/model_configs.py b/nmrcraft/models/model_configs.py index 75132c5..e4db8ca 100644 --- a/nmrcraft/models/model_configs.py +++ b/nmrcraft/models/model_configs.py @@ -15,7 +15,7 @@ "gradient_boosting": { "model_params": {"random_state": 42}, "hyperparameters": { - "loss": hp.choice("loss", ["log_loss", "exponential"]), + "loss": hp.choice("loss", ["log_loss"]), "learning_rate": hp.uniform("learning_rate", 0.01, 0.5), "n_estimators": hp.choice("n_estimators", range(10, 1000, 10)), # "subsample": hp.uniform("subsample", 0.01, 1.0), @@ -31,17 +31,9 @@ "logistic_regression": { "model_params": {"random_state": 42}, "hyperparameters": { - "penalty": hp.choice("penalty", ["l1", "l2", "elasticnet", None]), "C": hp.uniform("C", 0.01, 10.0), - "solver": hp.choice("solver", ["saga"]), - # lbfgs --> l2, None - # liblinear --> l1, l2 - # newton-cg --> l2, None - # newton-cholesky --> l2, None - # sag --> l2, None - # saga --> l1, l2, elasticnet, None - "max_iter": hp.choice("max_iter", range(100, 1000, 100)), - "l1_ratio": hp.uniform("l1_ratio", 0.01, 1.0), + "solver": hp.choice("solver", ["newton-cg", "sag", "saga"]), + # "max_iter": hp.choice("max_iter", range(100, 1000, 100)), }, }, "svc": { @@ -55,7 +47,6 @@ "gamma": hp.choice("gamma", ["scale", "auto"]), "coef0": hp.uniform("coef0", 0.0, 1.0), "shrinking": hp.choice("shrinking", [True, False]), - "probability": hp.choice("probability", [True, False]), # "max_iter": hp.choice("max_iter", range(100, 1000, 100)), }, }, diff --git a/nmrcraft/models/models.py b/nmrcraft/models/models.py index 1c42e52..39d672b 100644 --- a/nmrcraft/models/models.py +++ b/nmrcraft/models/models.py @@ -69,6 +69,9 @@ def load_model(model_name: str, **kwargs: Any): if model_name == "svc": kwargs["probability"] = True + if model_name == "gpc": + kwargs["multi_class"] = "one_vs_one" + # Forth, validate all provided kwargs before creating the model instance validate_kwargs(kwargs, model_class, model_name) diff --git a/scripts/analysis/dataset_statistics.py b/scripts/analysis/dataset_statistics.py index 217608d..aad9c1e 100644 --- a/scripts/analysis/dataset_statistics.py +++ b/scripts/analysis/dataset_statistics.py @@ -6,7 +6,7 @@ import seaborn as sns from nmrcraft.analysis.plotting import style_setup -from nmrcraft.data.dataset import filename_to_ligands, load_dataset_from_hf +from nmrcraft.data.dataloader import filename_to_ligands, load_dataset_from_hf def plot_stacked_bars( diff --git a/scripts/analysis/pca_ligand_space.py b/scripts/analysis/pca_ligand_space.py index 49f22cf..ecf0a8b 100644 --- a/scripts/analysis/pca_ligand_space.py +++ b/scripts/analysis/pca_ligand_space.py @@ -7,7 +7,7 @@ from sklearn.preprocessing import StandardScaler from nmrcraft.analysis.plotting import style_setup -from nmrcraft.data.dataset import filename_to_ligands, load_dataset_from_hf +from nmrcraft.data.dataloader import filename_to_ligands, load_dataset_from_hf def perform_pca(df, features): diff --git a/scripts/training/baselines.py b/scripts/training/baselines.py new file mode 100644 index 0000000..336463a --- /dev/null +++ b/scripts/training/baselines.py @@ -0,0 +1,120 @@ +import argparse +import logging as log + +import numpy as np +import pandas as pd +from sklearn.metrics import ( + accuracy_score, + confusion_matrix, + f1_score, + precision_score, + recall_score, +) + +# Import your data loading utilities +from nmrcraft.data.dataloader import DataLoader + + +def evaluate_model(y_test, y_pred, y_labels): + metrics = {} + cm_list = [] + target_index = 0 + for target_name, labels in y_labels.items(): + cm = confusion_matrix(y_test[:, target_index], y_pred[:, target_index]) + accuracy = accuracy_score( + y_test[:, target_index], y_pred[:, target_index] + ) + f1 = f1_score( + y_test[:, target_index], y_pred[:, target_index], average="macro" + ) + precision = precision_score( + y_test[:, target_index], + y_pred[:, target_index], + average="macro", + zero_division=0, + ) + recall = recall_score( + y_test[:, target_index], y_pred[:, target_index], average="macro" + ) + # roc_auc = roc_auc_score(y_test[:, target_index], y_pred[:, target_index]) + metrics[target_name] = { + "Accuracy": accuracy, + "F1": f1, + "Precision": precision, + "Recall": recall, + # "ROC-AUC": roc_auc + } + labels = labels + cm_list.append((target_name, cm)) + target_index += 1 + return metrics, cm_list + + +def main(): + parser = argparse.ArgumentParser( + description="Simplified model training script." + ) + parser.add_argument( + "--targets", + type=str, + default=["metal"], + help="The Target for the predictions.", + ) + parser.add_argument( + "--dataset_size", + type=float, + default=1.0, + help="Size of the dataset to load.", + ) + parser.add_argument( + "--random_baseline", + type=bool, + default=False, + help="Use a random baseline model.", + ) + args = parser.parse_args() + + # Set up logging + log.basicConfig( + level=log.INFO, format="%(asctime)s - %(levelname)s - %(message)s" + ) + + # Load data + dataloader = DataLoader( + target_columns=args.targets, + dataset_size=args.dataset_size, + feature_columns=[ + "M_sigma11_ppm", + "M_sigma22_ppm", + "M_sigma33_ppm", + "E_sigma11_ppm", + "E_sigma22_ppm", + "E_sigma33_ppm", + ], + complex_geometry="oct", + test_size=0.3, + random_state=42, + include_structural_features=False, + testing=False, + ) + X_train, X_test, y_train, y_test, y_labels = dataloader.load_data() + + predictions = np.zeros_like(y_test) + + for i in range(len(args.targets)): # Loop through each target column + if args.random_baseline: + unique_vals = np.unique(y_train[:, i]) + predictions[:, i] = np.random.choice(unique_vals, size=len(y_test)) + else: + most_common = pd.Series(y_train[:, i]).mode()[0] + predictions[:, i] = np.full( + shape=y_test[:, i].shape, fill_value=most_common + ) + + # Evaluate the model + metrics, confusion_matrices = evaluate_model(y_test, predictions, y_labels) + log.info("Evaluation Metrics: %s", metrics) + + +if __name__ == "__main__": + main() diff --git a/scripts/training/final_results.py b/scripts/training/one_target.py similarity index 54% rename from scripts/training/final_results.py rename to scripts/training/one_target.py index 1c407db..1ee6472 100644 --- a/scripts/training/final_results.py +++ b/scripts/training/one_target.py @@ -19,13 +19,13 @@ parser.add_argument( "--max_evals", type=int, - default=10, - help="The max evaluatins for the hyperparameter tuning with hyperopt", + default=2, + help="The max evaluations for the hyperparameter tuning with hyperopt", ) parser.add_argument( "--target", type=str, - default="metal", + default="X3", help="The Target for the predictions. Choose from: 'metal', 'X1', 'X2', 'X3', 'X4', 'L', 'E' ", ) parser.add_argument( @@ -54,22 +54,21 @@ log.getLogger().setLevel(log.INFO) dataset_sizes = [ - 0.01, + # 0.01, 0.1, + # 0.15 # 0.5, - 1.0, + # 1.0, ] models = [ - "random_forest", + # "random_forest", "logistic_regression", - "gradient_boosting", - "svc", + # "gradient_boosting", + # "svc", ] with mlflow.start_run(): - model_data = pd.DataFrame( - columns=["accuracy", "f1_score", "dataset_size", "model"] - ) + model_metrics = [] for model in models: data = pd.DataFrame() for dataset_size in dataset_sizes: @@ -79,27 +78,59 @@ max_evals=args.max_evals, target=args.target, dataset_size=dataset_size, - random_state=11, + random_state=42, ) # mlflow.log_metrics("dataset_size", dataset_size, step=i) C.hyperparameter_tune() C.train() - new_data = C.evaluate() + rates_df, metrics, cm = C.evaluate() + print(rates_df) + print(metrics) + print(cm) + # data[str(dataset_size)] = new_data + # Convert args.target and dataset_size into DataFrames by wrapping them in lists + target_df = pd.DataFrame([args.target], columns=["Target"]) + dataset_size_df = pd.DataFrame( + [dataset_size], columns=["Dataset Size"] + ) + + model_data = pd.DataFrame( + columns=[ + "target", + "dataset_size", + "model", + "accuracy", + "accuracy_std", + "f1_score", + "f1_score_std", + ] + ) + # Concatenate the new DataFrames with data and metrics data = pd.concat( - [data, new_data.assign(dataset_size=dataset_size)], + [target_df, dataset_size_df, data, metrics], axis=1 ) - data_BS = C.train_bootstraped(10) + + data_BS = C.train_bootstraped(n_times=10) model_data = pd.concat([model_data, data_BS]) + visualizer = Visualizer( + model_name=model, + cm=cm, + rates=rates_df, + metrics=metrics, + folder_path=args.plot_folder, + classes=C.y_labels, + dataset_size=str(dataset_size), + ) + path_CM = visualizer.plot_confusion_matrix() + # print(data) + data.index = dataset_sizes + model_metrics.append(data) data.index = dataset_sizes - visualizer = Visualizer( - model_name=model, data=data, folder_path=args.plot_folder - ) - path_ROC = visualizer.plot_ROC(filename=f"ROC_Plot_{model}.png") - mlflow.log_artifact(path_ROC, f"ROC_Plot_{model}.png") - print(model_data) + # path_ROC = visualizer.plot_ROC(filename=f"ROC_Plot_{model}.png") + # mlflow.log_artifact(path_ROC, f"ROC_Plot_{model}.png") path_AC = visualizer.plot_metric( data=model_data, @@ -114,5 +145,9 @@ filename="f1_score.png", ) + for df, model in zip(model_metrics, models): + print(model) + print(df) + # mlflow.log_artifact("F1_Plot", path_F1) # mlflow.log_artifact("Accuracy_Plot", path_AC) diff --git a/scripts/training/test.py b/scripts/training/test.py new file mode 100644 index 0000000..b5b3604 --- /dev/null +++ b/scripts/training/test.py @@ -0,0 +1,97 @@ +import argparse + +import mlflow +from sklearn.metrics import ( + accuracy_score, + confusion_matrix, + f1_score, +) + +from nmrcraft.data.dataloader import DataLoader + +# precision_score, +# recall_score, +from nmrcraft.models.model_configs import model_configs +from nmrcraft.models.models import load_model +from nmrcraft.training.hyperparameter_tune import HyperparameterTuner +from nmrcraft.utils.set_seed import set_seed + +set_seed() + + +def main(dataset_size, target, model_name): + # TODO: better experiment naming + mlflow.set_experiment("Ceci_nest_pas_un_experiment") + + with mlflow.start_run(): + config = model_configs[model_name] + + feature_columns = [ + "M_sigma11_ppm", + "M_sigma22_ppm", + "M_sigma33_ppm", + "E_sigma11_ppm", + "E_sigma22_ppm", + "E_sigma33_ppm", + ] + + data_loader = DataLoader( + feature_columns=feature_columns, + target_columns=args.target, + dataset_size=args.dataset_size, + target_type="categorical", + ) + + # Load and preprocess data + X_train, X_test, y_train, y_test, y_labels = data_loader.load_data() + + tuner = HyperparameterTuner(model_name, config, max_evals=1) + best_params, _ = tuner.tune(X_train, y_train) + + model_func = lambda **params: load_model( + model_name, **{**params, **config["model_params"]} + ) + best_model = model_func(**best_params) + best_model.fit(X_train, y_train) + + mlflow.log_params(best_params) + mlflow.log_params( + { + "model_name": model_name, + "dataset_size": dataset_size, + "target": target, + } + ) + + y_pred = best_model.predict(X_test) + cm = confusion_matrix(y_test, y_pred) + ac = accuracy_score(y_test, y_pred) + f1 = f1_score(y_test, y_pred, average="macro") + print(f"Accuracy: {ac}, F1: {f1}, Confusion Matrix:\n{cm}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Train a model with MLflow tracking." + ) + parser.add_argument( + "--dataset_size", + type=float, + default=0.01, + help="Fraction of dataset to use", + ) + parser.add_argument( + "--target", + type=str, + default="X3", + help="Specify the target(s) to select (metal, X1-X4, L, E or combinations of them, e.g., metal_1X_L)", + ) + parser.add_argument( + "--model_name", + type=str, + default="gradient_boosting", + help="Model name to load ('random_forest', 'logistic_regression', 'svc')", + ) + args = parser.parse_args() + + main(args.dataset_size, args.target, args.model_name) diff --git a/scripts/training/train_metal.py b/scripts/training/train_metal.py deleted file mode 100644 index 614784a..0000000 --- a/scripts/training/train_metal.py +++ /dev/null @@ -1,144 +0,0 @@ -import argparse - -import mlflow - -from nmrcraft.analysis.plotting import plot_confusion_matrix, plot_roc_curve -from nmrcraft.data.dataset import DataLoader -from nmrcraft.evaluation.evaluation import ( - get_cm_path, - get_roc_path, - model_evaluation, - model_evaluation_nD, -) -from nmrcraft.models.model_configs import model_configs -from nmrcraft.models.models import load_model -from nmrcraft.training.hyperparameter_tune import HyperparameterTuner -from nmrcraft.utils.set_seed import set_seed - -set_seed() - - -def main(dataset_size, target, model_name): - # TODO: better experiment naming - mlflow.set_experiment("Ceci_nest_pas_un_experiment") - - with mlflow.start_run(): - config = model_configs[model_name] - - feature_columns = [ - "M_sigma11_ppm", - "M_sigma22_ppm", - "M_sigma33_ppm", - "E_sigma11_ppm", - "E_sigma22_ppm", - "E_sigma33_ppm", - ] - - data_loader = DataLoader( - feature_columns=feature_columns, - target_columns=args.target, - dataset_size=args.dataset_size, - ) - - # Load and preprocess data - X_train, X_test, y_train, y_test, y_labels = data_loader.load_data() - - tuner = HyperparameterTuner(model_name, config, max_evals=1) - best_params, _ = tuner.tune(X_train, y_train, X_test, y_test) - - model_func = lambda **params: load_model( - model_name, **{**params, **config["model_params"]} - ) - best_model = model_func(**best_params) - best_model.fit(X_train, y_train) - - mlflow.log_params(best_params) - mlflow.log_params( - { - "model_name": model_name, - "dataset_size": dataset_size, - "target": target, - } - ) - - if isinstance(y_test, list): # if target is 1D - metrics, cm, fpr, tpr = model_evaluation( - best_model, X_test, y_test, y_labels, data_loader - ) - - title = r"Confusion matrix, TODO add LaTeX symbols" - plot_confusion_matrix( - cm, - classes=data_loader.confusion_matrix_label_adapter(y_labels), - title=title, - path=get_cm_path(), - ) - # Plot ROC - title = r"ROC curve, TODO add LaTeX symbols" - plot_roc_curve( - fpr, tpr, metrics["roc_auc"], title=title, path=get_roc_path() - ) - # Logging 1D only data - mlflow.log_artifact(get_roc_path()) - - elif ( - data_loader.more_than_one_target() - ): # Multidimensional target Array and Multiple targets - metrics, cm = model_evaluation_nD( - best_model, X_test, y_test, y_labels, data_loader - ) - - title = r"Confusion matrix, TODO add LaTeX symbols" - plot_confusion_matrix( - cm, - classes=data_loader.confusion_matrix_label_adapter(y_labels), - title=title, - path=get_cm_path(), - full=False, - columns_set=data_loader.get_target_columns_separated(), - ) - - else: # Multidimensional target Array and single target - metrics, cm = model_evaluation_nD( - best_model, X_test, y_test, y_labels, data_loader - ) - title = r"Confusion matrix, TODO add LaTeX symbols" - plot_confusion_matrix( - cm, - classes=data_loader.confusion_matrix_label_adapter(y_labels), - title=title, - path=get_cm_path(), - ) - - # Logging common data - mlflow.log_metrics(metrics) - mlflow.sklearn.log_model(best_model, "model") - print(f"Accuracy: {metrics['accuracy']}") - mlflow.log_artifact(get_cm_path()) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Train a model with MLflow tracking." - ) - parser.add_argument( - "--dataset_size", - type=float, - default=0.01, - help="Fraction of dataset to use", - ) - parser.add_argument( - "--target", - type=str, - default="X1", - help="Specify the target(s) to select (metal, X1-X4, L, E or combinations of them, e.g., metal_1X_L)", - ) - parser.add_argument( - "--model_name", - type=str, - default="random_forest", - help="Model name to load ('random_forest', 'gradient_boosting', 'logistic_regression', 'svc')", - ) - args = parser.parse_args() - - main(args.dataset_size, args.target, args.model_name) diff --git a/tests/test_dataloader.py b/tests/test_dataloader.py index a7670fc..674b5d7 100644 --- a/tests/test_dataloader.py +++ b/tests/test_dataloader.py @@ -1,58 +1,60 @@ -import numpy import pytest -from nmrcraft.data.dataset import DataLoader +from nmrcraft.data.dataloader import DataLoader +# def test_valid_targets(): +# """ +# This tests checks whether some correctly passed --targets go through as expected. +# """ +# feature_columns = [ +# "M_sigma11_ppm", +# "M_sigma22_ppm", +# "M_sigma33_ppm", +# "E_sigma11_ppm", +# "E_sigma22_ppm", +# "E_sigma33_ppm", +# ] -def test_valid_targets(): - """ - This tests checks whether some correctly passed --targets go through as expected. - """ - feature_columns = [ - "M_sigma11_ppm", - "M_sigma22_ppm", - "M_sigma33_ppm", - "E_sigma11_ppm", - "E_sigma22_ppm", - "E_sigma33_ppm", - ] - - target_columns_set = [ - "metal", - "metal_X1", - "metal_X1_X2_X3", - "metal_X1_X2_X3_X4_L", - "metal_X1_X2_X3_X4_E", - ] - ys = [] - for target_columns in target_columns_set: - data_loader = DataLoader( - feature_columns=feature_columns, - target_columns=target_columns, - dataset_size=1, - testing=True, - ) - x, x_t, y, y_t, y_cols = data_loader.load_data() - ys.append(y_t) - if isinstance( - y[0], numpy.int64 - ): # if the y_t array is 1D, check if the dimensions are the same - assert isinstance(x, numpy.ndarray) - assert isinstance(y, list) - assert isinstance(y_cols, list) - elif isinstance( - y[0], numpy.ndarray - ): # if the y_t array isn't 1D int array, check if the dimensions are the same on all and if the contents are correct - assert isinstance(x, numpy.ndarray) - assert isinstance(y, numpy.ndarray) - assert isinstance(y_cols, list) - assert len(y_cols) == len(y_t[0]) and len(y[0]) == len(y_t[0]) - assert len(x[0]) == len(x_t[0]) - assert isinstance(x[0][0], numpy.float64) and isinstance( - y[0][0], numpy.int64 - ) - print(ys) - # Here we need to assert if the dimension, content etc of the y_targets are correct. +# target_columns_set = [ +# "metal", +# "metal_X1", +# "metal_X1_X2_X3", +# "metal_X1_X2_X3_X4_L", +# "metal_X1_X2_X3_X4_E", +# ] +# ys = [] +# for target_columns in target_columns_set: +# data_loader = DataLoader( +# feature_columns=feature_columns, +# target_columns=target_columns, +# dataset_size=1, +# testing=True, +# complex_geometry="oct", +# test_size=0.3, +# random_state=42, +# include_structural_features=True +# ) +# x, x_t, y, y_t, y_cols = data_loader.load_data() +# ys.append(y_t) +# if isinstance( +# y[0], numpy.int64 +# ): # if the y_t array is 1D, check if the dimensions are the same +# assert isinstance(x, numpy.ndarray) +# assert isinstance(y, list) +# assert isinstance(y_cols, list) +# elif isinstance( +# y[0], numpy.ndarray +# ): # if the y_t array isn't 1D int array, check if the dimensions are the same on all and if the contents are correct +# assert isinstance(x, numpy.ndarray) +# assert isinstance(y, numpy.ndarray) +# assert isinstance(y_cols, list) +# assert len(y_cols) == len(y_t[0]) and len(y[0]) == len(y_t[0]) +# assert len(x[0]) == len(x_t[0]) +# assert isinstance(x[0][0], numpy.float64) and isinstance( +# y[0][0], numpy.int64 +# ) +# print(ys) +# # Here we need to assert if the dimension, content etc of the y_targets are correct. def test_unsupported_targets(): # Check if unsupported targets get recognized @@ -70,26 +72,9 @@ def test_unsupported_targets(): # Check if unsupported targets get recognized target_columns="metal_X1_R-ligand", dataset_size=1, testing=True, + complex_geometry="oct", + test_size=0.3, + random_state=42, + include_structural_features=True, ) del data_loader - - -def test_unsupported_target_type(): - with pytest.raises(ValueError): - feature_columns = [ - "M_sigma11_ppm", - "M_sigma22_ppm", - "M_sigma33_ppm", - "E_sigma11_ppm", - "E_sigma22_ppm", - "E_sigma33_ppm", - ] - data_loader = DataLoader( - feature_columns=feature_columns, - target_columns="metal_X1_X2_X3_L_E", - dataset_size=1, - testing=True, - target_type="rone-hot-percoding", # wrong type of target - ) - a, b, c, d, e = data_loader.load_data() - del a, b, c, d, e