Chore/docstrings (#88)
* Added docstring to DataLoader class and deleted deprecated tutorial

* Docstrings for data_utils.py

* Add docstrings to evaluation.py

* Re-add deleted function to evaluation with docstrings

* Rename vars in evaluation.py for clarity

* Added docstring to visualizer.py

Wait, are we even using this or will this just be deleted anyway?

* Docstrings for classifier.py

* Added docstring to the add_rows_metrics function

This function is very ugly; however, it works and I don't think anyone
should try to reformat it.

* Add Poster.pdf

* Add Poster.pdf

This time for real

* Added poster into Readme as picture

* Added different resolution posters and defaulted to 200dpi

* Clarify general.py

* Delete assets/Poster_100dpi.png

* Delete assets/Poster_200dpi.png

* Delete assets/Poster_400dpi.png

---------

Co-authored-by: Tiago Würthner <[email protected]>
Co-authored-by: Tiago Würthner <[email protected]>
Co-authored-by: Magdalena Lederbauer <[email protected]>
4 people authored Jun 19, 2024
1 parent d1bd519 commit 4868d4c
Showing 10 changed files with 192 additions and 486 deletions.
4 changes: 3 additions & 1 deletion README.md
@@ -104,7 +104,9 @@ When the parameter `max_eval` is set to a high value such as 20, expect the whol

# 🖼️Poster

If you were not able to visit our beautiful poster at ETH Zurich on May 30th 2024, you can access our poster [here](TODO)!
If you were not able to visit our beautiful poster at ETH Zurich on May 30th 2024, you can access our poster [here](assets/Poster.pdf)!

![Poster](assets/Poster_200dpi.png)

# 🧑‍💻 Developing

Binary file added assets/Poster.pdf
Binary file not shown.
Binary file added assets/Poster_1000dpi.png
12 changes: 10 additions & 2 deletions nmrcraft/data/data_utils.py
@@ -40,8 +40,16 @@ def filename_to_ligands(dataset: pd.DataFrame):
return dataset


def load_dummy_dataset_locally(datset_path: str = "tests/data.csv"):
dataset = pd.read_csv(datset_path)
def load_dummy_dataset_locally(dataset_path: str = "tests/data.csv"):
"""
Load a dummy dataset from a local CSV file for testing purposes.
Args:
dataset_path (str, optional): The path to the CSV file containing the dataset. Defaults to "tests/data.csv".
Returns:
pandas.DataFrame: The dataset loaded from the CSV file.
"""
dataset = pd.read_csv(dataset_path)
return dataset
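
A minimal usage sketch of the helper documented above, assuming the import path matches the file location and that `tests/data.csv` is present:

```python
from nmrcraft.data.data_utils import load_dummy_dataset_locally  # import path assumed

# Load the dummy CSV used by the test suite (default path "tests/data.csv").
df = load_dummy_dataset_locally()
print(df.shape)
```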


31 changes: 31 additions & 0 deletions nmrcraft/data/dataloader.py
@@ -31,6 +31,37 @@


class DataLoader:
"""
DataLoader is responsible for loading and preparing data for machine learning models
in the `nmrcraft` project.
It supports configuration of various dataset parameters including feature selection,
target column specification, dataset size manipulation, and can return split datasets
tuned for training and testing phases.
Parameters:
feature_columns (list of str): Names of columns to be used as features.
target_columns (str): Name(s) of the column(s) used as targets.
test_size (float): Proportion of the dataset to include in the test split.
random_state (int): Seed used by random number generator for reproducibility.
dataset_size (float): Proportion of the full dataset to use.
complex_geometry (str): Specifies the type of complex geometries to include ('oct', 'spy', 'tbp', or 'all').
include_structural_features (bool): Indicates whether structural features should be included in the dataset.
Returns:
dataloader (DataLoader): dataloader object that is used to load and preprocess the dataset.
Example:
>>> data_loader = DataLoader(
feature_columns=["M_sigma11_ppm", "M_sigma22_ppm"],
target_columns="metal X4_ligand E_ligand",
test_size=0.2,
random_state=42,
dataset_size=0.1,
complex_geometry="all",
include_structural_features=True
)
"""

def __init__(
self,
target_columns: str,
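
A hedged sketch of how the documented loader is typically consumed. The constructor arguments mirror the docstring example, while the `load_data()` call and its four-way unpacking are assumptions based on the `classifier.py` hunk further down:

```python
from nmrcraft.data.dataloader import DataLoader  # import path assumed

data_loader = DataLoader(
    feature_columns=["M_sigma11_ppm", "M_sigma22_ppm"],
    target_columns="metal",
    test_size=0.2,
    random_state=42,
    dataset_size=0.1,
    complex_geometry="all",
    include_structural_features=True,
)
# The exact return signature is not shown in this diff; a train/test split
# of features and targets is assumed here.
X_train, X_test, y_train, y_test = data_loader.load_data()
```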
45 changes: 35 additions & 10 deletions nmrcraft/evaluation/evaluation.py
@@ -65,6 +65,25 @@ def evaluate_model(


def evaluate_bootstrap(X_test, y_test, model, targets, n_times=10):
"""
Perform bootstrap evaluation of a model on test data.
This function repeatedly samples with replacement from the test dataset and evaluates
the model on these samples. It aggregates the performance metrics across all bootstrap
samples to give a robust estimate of the model's generalizability.
Args:
X_test (np.ndarray): The input features of the test data.
y_test (np.ndarray): The true labels of the test data.
model (object): The model that is being evaluated.
targets (List[str]): A list of target variable names.
n_times (int, optional): The number of bootstrap samples to generate.
Returns:
Dict[str, Dict[str, List[float]]]: A dictionary containing the computed metrics
for each target. Each target's value is another dictionary containing lists
of performance scores ('Accuracy' and 'F1') across the bootstrap samples.
"""
bootstrap_metrics: Dict = {}
for _ in range(n_times):
X_test, y_test = resample(
@@ -91,27 +110,33 @@ def evaluate_bootstrap(X_test, y_test, model, targets, n_times=10):

def metrics_statistics(
bootstrapped_metrics,
): # TODO: Handle what to do when there are more than one target -> unify scores or return splitted
"""
Do statistics with the bootsrapped metrics
):
"""Calculate the statistical summary of bootstrapped evaluation metrics with F1 score and Accuracy.
Args:
dict: bootstrapped_metrics
bootstrapped_metrics (dict): A dictionary containing the name of each target with another dictionary
as value, which includes values of the F1 scores and Accuracies of the bootstrapped models.
Returns:
dict: Mean and 95% ci for the bootstrapped values for each target
list: A list containing five elements:
- [0]: List of target names for which metrics are calculated.
- [1]: List of mean accuracies for each target.
- [2]: List of tuples where each tuple consists of the lower and upper bounds of the 95% confidence interval for accuracy for each target.
- [3]: List of mean F1 scores for each target.
- [4]: List of tuples where each tuple consists of the lower and upper bounds of the 95% confidence interval for F1 score for each target.
Each element in the list corresponds to a specific set of statistical values related to the performance metrics (accuracy and F1 score) of the bootstrapped models for each target.
"""
# metrics_stats = pd.DataFrame(columns=["Targets", "Accuracy_mean", "Accuracy_ci", "F1_mean", "F1_ci",])
Targets = []
Accuracy_mean = []
Accuracy_ci = []
F1_mean = []
F1_ci = []

for key, value in bootstrapped_metrics.items():
# calc mean and 95% confidence interval for Accuracy
Targets.append(key)
for target, value in bootstrapped_metrics.items():
Targets.append(target)

# Calculate mean and 95% confidence interval for Accuracy
Accuracy_mean.append(np.mean(value["Accuracy"]))
Accuracy_ci.append(
st.t.interval(
@@ -122,7 +147,7 @@ def metrics_statistics(
)
)

# calc mean and 95% confidence interval for F1 score
# Calculate mean and 95% confidence interval for F1 score
F1_mean.append(np.mean(value["F1"]))
F1_ci.append(
st.t.interval(
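
To make the return layouts described in these docstrings concrete, a purely illustrative sketch of the data shapes (the values are invented; only the nesting follows the two docstrings above):

```python
# Shape returned by evaluate_bootstrap(): one entry per target,
# each holding per-bootstrap-sample scores (illustrative values only).
bootstrap_metrics = {
    "metal": {"Accuracy": [0.90, 0.92, 0.91], "F1": [0.89, 0.91, 0.90]},
    "X4_ligand": {"Accuracy": [0.75, 0.78, 0.77], "F1": [0.72, 0.74, 0.73]},
}

# Shape returned by metrics_statistics(bootstrap_metrics):
stats = [
    ["metal", "X4_ligand"],        # [0] target names
    [0.91, 0.77],                  # [1] mean accuracy per target
    [(0.88, 0.94), (0.73, 0.81)],  # [2] 95% CI (lower, upper) for accuracy
    [0.90, 0.73],                  # [3] mean F1 per target
    [(0.87, 0.93), (0.70, 0.76)],  # [4] 95% CI (lower, upper) for F1
]
```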
15 changes: 15 additions & 0 deletions nmrcraft/evaluation/visualizer.py
@@ -123,6 +123,21 @@ def plot_metric(
title="Title",
filename="Plot.png",
):
"""
Generates a plot for a specified metric against dataset size for different models.
The graph includes error bars representing the standard deviation of the metric.
Args:
data (pd.DataFrame): DataFrame with columns 'model', 'dataset_size', metric, and its standard deviation.
metric (str): Name of the metric to be plotted (e.g., 'accuracy', 'f1_score').
title (str, optional): Plot title. Defaults to "Title".
filename (str, optional): Filename for saving the plot. Defaults to "Plot.png".
Returns:
str: Path where the plot is saved.
"""

for model in data["model"].unique():
model_data = data[data["model"] == model]
std_name = metric + "_std"
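
A usage sketch for the plotting helper, under stated assumptions: the import path and the free-function call are guesses (the hunk does not show whether `plot_metric` is a module-level function or a method of a plotting class), and the column names follow the docstring:

```python
import pandas as pd

from nmrcraft.evaluation.visualizer import plot_metric  # import path assumed

# Toy frame with the columns the docstring describes: 'model', 'dataset_size',
# the metric itself, and its standard deviation.
data = pd.DataFrame({
    "model": ["random_forest"] * 3,
    "dataset_size": [0.1, 0.5, 1.0],
    "accuracy": [0.72, 0.85, 0.90],
    "accuracy_std": [0.04, 0.02, 0.01],
})
saved_path = plot_metric(
    data,
    metric="accuracy",
    title="Accuracy vs. dataset size",
    filename="accuracy.png",
)
```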
68 changes: 63 additions & 5 deletions nmrcraft/models/classifier.py
@@ -18,6 +18,31 @@


class Classifier:
"""
A machine learning classifier for structured data prediction.
This class encapsulates the entire process of model construction, from data loading
and preprocessing, through hyperparameter tuning, to training and evaluation.
Attributes:
model_name (str): Identifier for the model type.
max_evals (int): Maximum number of evaluations for tuning the model's hyperparameters.
target (str): Name of the target variable(s) in the dataset.
dataset_size (float): Size of the dataset to be used.
feature_columns (list, optional): List of feature names to be included in the model. Defaults to a predefined list.
random_state (int, optional): Seed for random number generators for reproducibility. Defaults to 42.
include_structural_features (bool, optional): Flag to include structural features in the data. Defaults to True.
complex_geometry (str, optional): Geometry type associated with the metal complexes. Defaults to 'oct'.
test_size (float, optional): Proportion of the dataset to include in the test split. Defaults to 0.2.
testing (bool, optional): Flag to indicate whether the instance is used for testing, affecting certain behaviors. Defaults to False.
Methods:
hyperparameter_tune: Tunes model parameters using specified algorithms.
train: Fits the model on the training data.
train_bootstrapped: Performs training using bootstrapped samples to gather statistics on models.
evaluate: Assesses model performance on test data.
"""

def __init__(
self,
model_name: str,
@@ -71,6 +96,10 @@ def __init__(
) = data_loader.load_data()

def hyperparameter_tune(self):
"""
Optimizes model parameters using training data and updates the best_params attribute.
"""

log.info(
f"Performing Hyperparameter tuning for the Model ({self.model_name})"
)
@@ -80,15 +109,27 @@ def train(self):
def train(self):
"""
Train the machine learning model using the best hyperparameters.
Returns:
None
"""

all_params = {**self.model_config["model_params"], **self.best_params}
self.model = load_model(self.model_name, **all_params)
self.model.fit(self.X_train, self.y_train)

def train_bootstrapped(self, n_times=10):
"""
Trains the model using bootstrapping to estimate accuracy and F1 score.
This method resamples the training set with replacement 'n_times', trains the model,
and then evaluates it to collect accuracy and F1 scores. It returns a DataFrame containing
the mean and standard deviation of these metrics.
Args:
n_times (int, optional): Number of bootstrap samples to generate. Defaults to 10.
Returns:
pd.DataFrame: DataFrame containing mean and standard deviation of accuracy and F1 score.
"""

accuracy = []
f1_score = []
i = 0
@@ -120,8 +161,16 @@ def evaluate(self) -> pd.DataFrame:
Evaluate the performance of the trained machine learning model.
Returns:
pd.DataFrame: A DataFrame containing evaluation metrics (accuracy, f1_score, roc_auc),
the confusion matrix, false positive rates, and true positive rates for each class.
pd.DataFrame: A single-row DataFrame with the following columns:
- 'accuracy' (float)
- 'accuracy_std' (float)
- 'f1_score' (float)
- 'f1_score_std' (float)
- 'dataset_size' (float)
- 'model' (str)
- 'confusion_matrix' (list of lists)
- 'fpr' (list)
- 'tpr' (list)
"""
y_pred = self.model.predict(self.X_test)
# print(y_pred)
@@ -138,6 +187,15 @@ def evaluate(self) -> pd.DataFrame:
cm = confusion_matrix(self.y_test, y_pred)

def calculate_fpr_fnr(cm):
"""
Calculates the False Positive Rate (FPR) and False Negative Rate (FNR) for each class from a confusion matrix.
Args:
cm (np.ndarray): Confusion matrix.
Returns:
tuple: Two numpy arrays `(FPR, FNR)` containing the FPR and FNR for each class.
"""
FPR = []
FNR = []
num_classes = cm.shape[0]
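
Tying the Classifier docstrings together, a hedged end-to-end sketch of the workflow the Methods section describes; the import path and the exact constructor keywords are assumptions based on the class docstring and the visible `__init__` signature:

```python
from nmrcraft.models.classifier import Classifier  # import path assumed

clf = Classifier(
    model_name="random_forest",  # model identifier; supported names are not shown in this diff
    max_evals=5,
    target="metal",
    dataset_size=0.1,
)

clf.hyperparameter_tune()              # updates clf.best_params (per the docstring)
clf.train()                            # refits with the tuned hyperparameters
boot_df = clf.train_bootstrapped(n_times=10)  # mean/std of accuracy and F1 over resamples
results = clf.evaluate()               # single-row DataFrame: accuracy, f1_score, ...
```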
44 changes: 35 additions & 9 deletions nmrcraft/utils/general.py
@@ -9,21 +9,47 @@ def add_rows_metrics(
model_name: str,
max_evals: int,
):
# Add all the newly generated metrics to the unified dataframe targetwise
"""
Compiles and adds a series of statistical metrics into a unified DataFrame, one row at a time.
Args:
statistical_metrics (list): List of lists containing the mean and confidence intervals of
accuracy and F1-score.
dataset_size (int): Number of samples in the dataset.
include_structural (bool): Indicates whether structural data was included in the analysis.
model_name (str): Name of the model that produced the metrics.
max_evals (int): Number of evaluations conducted in the Hyperparameter tuning.
Returns:
unified_metrics (pd.DataFrame): DataFrame with all metrics containing these columns:
target, model_targets, model, nmr_only, dataset_fraction, max_evals, accuracy_mean,
accuracy_lb, accuracy_hb, f1_mean, f1_lb, f1_hb
"""
# Give meaning to indices
idx_name = 0
idx_accuracy_mean = 1
idx_accuracy_ci = 2
idx_f1score_mean = 3
idx_f1score_ci = 4
idx_lb = 0
idx_hb = 1

# Combine all data into single row and append to dataframe
for i in range(len(statistical_metrics[0])):
new_row = [
statistical_metrics[0][i],
statistical_metrics[0],
statistical_metrics[idx_name][i],
statistical_metrics[idx_name],
model_name,
not include_structural,
dataset_size,
max_evals,
statistical_metrics[1][i],
statistical_metrics[2][i][0],
statistical_metrics[2][i][1],
statistical_metrics[3][i],
statistical_metrics[4][i][0],
statistical_metrics[4][i][1],
statistical_metrics[idx_accuracy_mean][i],
statistical_metrics[idx_accuracy_ci][i][idx_lb],
statistical_metrics[idx_accuracy_ci][i][idx_hb],
statistical_metrics[idx_f1score_mean][i],
statistical_metrics[idx_f1score_ci][i][idx_lb],
statistical_metrics[idx_f1score_ci][i][idx_hb],
]
unified_metrics.loc[len(unified_metrics)] = new_row
return unified_metrics
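
A hedged sketch of how the refactored helper might be driven. The DataFrame column order follows the docstring's Returns section, the `stats` list mirrors the `metrics_statistics` layout shown earlier, and the keyword names (including `unified_metrics`, which is not visible in the truncated signature) are assumptions:

```python
import pandas as pd

from nmrcraft.utils.general import add_rows_metrics  # import path assumed

unified_metrics = pd.DataFrame(columns=[
    "target", "model_targets", "model", "nmr_only", "dataset_fraction",
    "max_evals", "accuracy_mean", "accuracy_lb", "accuracy_hb",
    "f1_mean", "f1_lb", "f1_hb",
])

# One-target example of the metrics_statistics() output (illustrative values).
stats = [["metal"], [0.91], [(0.88, 0.94)], [0.90], [(0.87, 0.93)]]

unified_metrics = add_rows_metrics(
    unified_metrics=unified_metrics,  # parameter name assumed
    statistical_metrics=stats,
    dataset_size=0.1,
    include_structural=True,
    model_name="random_forest",
    max_evals=5,
)
print(unified_metrics)
```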
