# Extracted from the patch "Add fraction best ligands metric"
# (commit 7aacddd90f66b3494e8bcbd1d6a9ab3059c7dbd9, Matt Warren,
# Wed, 5 Jun 2024 15:53:35 +0100), which targets
# cinnabar/classification_metric.py. The patch adds `import math`, removes the
# unimplemented `FBVL` stub and adds the functions below. The diff's unchanged
# context lines (the module's other imports, `_experiment_prediction_binning`
# and `acc_boots_tfunc`) belong to the target module and are not reproduced.
import math
from typing import Iterable

import numpy as np


def _rank_bin_edges(values: np.ndarray) -> np.ndarray:
    """Return bin edges that separate the sorted entries of a 1D array.

    The edges are the smallest value, the midpoints between consecutive
    sorted values, and the largest value, giving ``len(values)`` bins.
    """
    ordered = np.sort(values)
    midpoints = (ordered[:-1] + ordered[1:]) / 2
    return np.concatenate(([ordered[0]], midpoints, [ordered[-1]]))


def _create_2d_histogram(y_true: Iterable[float], y_pred: Iterable[float]):
    """
    Create a 2D histogram from two arrays of data.

    Each axis is binned with edges placed halfway between consecutive sorted
    values, so that distinct values each fall into their own bin and bin index
    ``i`` on an axis corresponds to the ``i``-th smallest value on that axis.

    NOTE(review): with tied values a zero-width bin is created and the tied
    points appear to share the neighbouring bin, so the histogram is only a
    permutation matrix for distinct values -- confirm this is the intended
    treatment of ties.

    Parameters
    ----------
    y_true : array-like
        The true values (one-dimensional, finite, non-empty).
    y_pred : array-like
        The predicted values, same length as `y_true`.

    Returns
    -------
    histogram : ndarray
        The 2D histogram of the input data.
    bins_true : ndarray
        The bin edges along the y_true axis.
    bins_pred : ndarray
        The bin edges along the y_pred axis.

    Raises
    ------
    ValueError
        If `y_true` and `y_pred` have different lengths, are not
        one-dimensional, are empty, or contain NaN/infinite values.
    TypeError
        If `y_true` or `y_pred` cannot be converted to numeric numpy arrays.
    """
    try:
        y_true = np.asarray(y_true, dtype=float)
        y_pred = np.asarray(y_pred, dtype=float)
    except (TypeError, ValueError) as e:
        raise TypeError("Input data cannot be converted to numpy arrays.") from e

    if y_true.shape != y_pred.shape:
        raise ValueError("y_true and y_pred must have the same length.")
    if y_true.ndim != 1:
        raise ValueError("y_true and y_pred must be one-dimensional.")
    if y_true.size == 0:
        raise ValueError("y_true and y_pred must not be empty.")
    # NaN/inf would end up inside the bin edges and make the counts meaningless.
    if not (np.isfinite(y_true).all() and np.isfinite(y_pred).all()):
        raise ValueError("y_true and y_pred must contain only finite values.")

    histogram, bins_true, bins_pred = np.histogram2d(
        y_true,
        y_pred,
        bins=[_rank_bin_edges(y_true), _rank_bin_edges(y_pred)],
    )

    return histogram, bins_true, bins_pred


def _compute_overlap_coefficient(histogram: np.ndarray, ranking: int):
    """
    Compute the overlap coefficient from a 2D histogram.

    The coefficient is the number of counts in the top-left
    ``ranking x ranking`` corner of the histogram divided by `ranking`, i.e.
    the share of entries that sit in the `ranking` lowest bins on both axes
    (for binding free energies, the most active ligands).

    Parameters
    ----------
    histogram : ndarray
        A 2D histogram array where the counts are stored.
    ranking : int
        The number of rankings to consider when computing overlap.

    Returns
    -------
    float
        The overlap coefficient.

    Raises
    ------
    ValueError
        If `ranking` is smaller than 1 or greater than the number of ligands
        (rows) in the histogram.
    """
    if ranking < 1:
        raise ValueError("Ranking must be greater than 0.")

    if histogram.shape[0] < ranking:
        raise ValueError("Ranking must be less than the number of ligands.")

    overlap = np.sum(histogram[:ranking, :ranking])

    return overlap / ranking


def compute_fraction_best_ligands(
    y_true: Iterable[float], y_pred: Iterable[float], fraction: float = 0.5
) -> float:
    """
    Compute the fraction of the best ligands metric introduced by Chris Bayly.

    The overlap coefficient is evaluated for every ranking from 1 up to
    ``floor(num_ligands * fraction)`` and the results are averaged.

    Parameters
    ----------
    y_true : array-like
        The true values.
    y_pred : array-like
        The predicted values.
    fraction : float, optional
        The fraction of ligands to consider as the best (default is 0.5).

    Returns
    -------
    float
        The computed fraction of the best ligands.

    Raises
    ------
    ValueError
        If `fraction` is not between 0 and 1, if the inputs are rejected by
        the histogram construction (mismatched lengths, empty, non-finite),
        or if `fraction` is so small that it selects no ligand at all.
    TypeError
        If `y_true` or `y_pred` cannot be converted to numeric numpy arrays.
    """
    if not (0 <= fraction <= 1):
        raise ValueError("Fraction must be between 0 and 1.")

    histogram = _create_2d_histogram(y_true, y_pred)[0]
    num_ligands = histogram.shape[0]
    num_best_ligands = math.floor(num_ligands * fraction)

    # Averaging over zero rankings is undefined; fail with a clear message
    # instead of dividing by zero below.
    if num_best_ligands < 1:
        raise ValueError(
            f"Fraction {fraction} selects no ligands out of {num_ligands}; "
            "increase the fraction or provide more ligands."
        )

    # Only the rankings that enter the average are evaluated.
    best_coefficients = [
        _compute_overlap_coefficient(histogram, rank)
        for rank in range(1, num_best_ligands + 1)
    ]

    return sum(best_coefficients) / num_best_ligands