From a9a9b0a8317d23b00b1c09c8e3582d6b806a490d Mon Sep 17 00:00:00 2001
From: rakow
Date: Mon, 15 Jul 2024 12:47:21 +0200
Subject: [PATCH] use custom log loss

---
Review notes (v2):
- log_loss imports warnings locally; the name was used but never imported
  in the lines this patch touches.
- Added the missing space in the "do not sum to one" warning text.
- Aggregation is done with numpy instead of scikit-learn's private
  _weighted_sum helper, so that import is dropped.
- Blob ids on the index lines are carried over unchanged from v1.

 matsim/calibration/run_simulations.py |   4 +-
 matsim/calibration/utils.py           | 134 ++++++++++++++++++++++++++
 2 files changed, 136 insertions(+), 2 deletions(-)

diff --git a/matsim/calibration/run_simulations.py b/matsim/calibration/run_simulations.py
index c077fae..a698a07 100644
--- a/matsim/calibration/run_simulations.py
+++ b/matsim/calibration/run_simulations.py
@@ -14,7 +14,6 @@
 METADATA = "run-simulations", "Utility to run multiple simulations at once."
 
 
-
 def likelihood_ratio(ll, ll_null):
     return (2 * (ll - ll_null))
 
@@ -43,7 +42,8 @@ def sample_y_null(shares: np.array, num_persons: int, num_samples: int):
 
 def process_results(runs):
     """Process results of multiple simulations"""
-    from sklearn.metrics import log_loss, accuracy_score
+    from .utils import log_loss
+    from sklearn.metrics import accuracy_score
     from sklearn.preprocessing import LabelEncoder
 
     print("Processing results in %s" % runs)
diff --git a/matsim/calibration/utils.py b/matsim/calibration/utils.py
index 9594f28..b954b74 100644
--- a/matsim/calibration/utils.py
+++ b/matsim/calibration/utils.py
@@ -3,6 +3,15 @@
 import numpy as np
 import pandas as pd
 
+from scipy.special import xlogy
+from sklearn.preprocessing import LabelBinarizer, LabelEncoder
+from sklearn.utils import (
+    assert_all_finite,
+    check_array,
+    check_consistent_length,
+    column_or_1d,
+)
+
 from optuna.trial import TrialState
 
 
@@ -88,3 +97,128 @@ def _f(jvm_args, jar, config, params_path, run_dir, trial_number, run_args):
         )
 
     return _f
+
+
+def log_loss(y_true, y_pred, *, eps="auto", normalize=True, sample_weight=None, labels=None):
+    """Log loss, aka logistic loss or cross-entropy loss.
+
+    Copy of the scikit-learn 1.3 implementation: ``y_pred`` is clipped and
+    renormalized, and rows that do not sum to one only trigger a warning.
+
+    Parameters
+    ----------
+    y_true : array-like of shape (n_samples,)
+        Ground-truth labels.
+    y_pred : array-like of shape (n_samples,) or (n_samples, n_classes)
+        Predicted probabilities. A 1d array or a single column is expanded
+        to the two columns ``[1 - p, p]``.
+    eps : float or "auto", default="auto"
+        Probabilities are clipped to ``[eps, 1 - eps]``. "auto" uses the
+        machine epsilon of the dtype of ``y_pred``.
+    normalize : bool, default=True
+        If true, return the (weighted) mean of the per-sample losses,
+        otherwise their (weighted) sum.
+    sample_weight : array-like of shape (n_samples,), default=None
+        Optional per-sample weights.
+    labels : array-like, default=None
+        Complete set of labels. If None, labels are inferred from ``y_true``.
+
+    Returns
+    -------
+    float
+        The log loss.
+
+    Raises
+    ------
+    ValueError
+        If only a single label is present, or if the number of labels does
+        not match the number of columns of ``y_pred``.
+    """
+    # Imported locally so this function does not rely on a module-level import.
+    import warnings
+
+    y_pred = check_array(
+        y_pred, ensure_2d=False, dtype=[np.float64, np.float32, np.float16]
+    )
+    if eps == "auto":
+        eps = np.finfo(y_pred.dtype).eps
+
+    check_consistent_length(y_pred, y_true, sample_weight)
+    lb = LabelBinarizer()
+
+    if labels is not None:
+        lb.fit(labels)
+    else:
+        lb.fit(y_true)
+
+    if len(lb.classes_) == 1:
+        if labels is None:
+            raise ValueError(
+                "y_true contains only one label ({0}). Please "
+                "provide the true labels explicitly through the "
+                "labels argument.".format(lb.classes_[0])
+            )
+        else:
+            raise ValueError(
+                "The labels array needs to contain at least two "
+                "labels for log_loss, "
+                "got {0}.".format(lb.classes_)
+            )
+
+    transformed_labels = lb.transform(y_true)
+
+    if transformed_labels.shape[1] == 1:
+        transformed_labels = np.append(
+            1 - transformed_labels, transformed_labels, axis=1
+        )
+
+    # Clipping
+    y_pred = np.clip(y_pred, eps, 1 - eps)
+
+    # If y_pred is of single dimension, assume y_true to be binary
+    # and then check.
+    if y_pred.ndim == 1:
+        y_pred = y_pred[:, np.newaxis]
+    if y_pred.shape[1] == 1:
+        y_pred = np.append(1 - y_pred, y_pred, axis=1)
+
+    # Check if dimensions are consistent.
+    transformed_labels = check_array(transformed_labels)
+    if len(lb.classes_) != y_pred.shape[1]:
+        if labels is None:
+            raise ValueError(
+                "y_true and y_pred contain different number of "
+                "classes {0}, {1}. Please provide the true "
+                "labels explicitly through the labels argument. "
+                "Classes found in "
+                "y_true: {2}".format(
+                    transformed_labels.shape[1], y_pred.shape[1], lb.classes_
+                )
+            )
+        else:
+            raise ValueError(
+                "The number of classes in labels is different "
+                "from that in y_pred. Classes found in "
+                "labels: {0}".format(lb.classes_)
+            )
+
+    # Renormalize
+    y_pred_sum = y_pred.sum(axis=1)
+    if not np.isclose(y_pred_sum, 1, rtol=1e-15, atol=5 * eps).all():
+        warnings.warn(
+            (
+                "The y_pred values do not sum to one. Starting from 1.5 this "
+                "will result in an error."
+            ),
+            UserWarning,
+        )
+    y_pred = y_pred / y_pred_sum[:, np.newaxis]
+    loss = -xlogy(transformed_labels, y_pred).sum(axis=1)
+
+    # Aggregate here instead of calling scikit-learn's private
+    # ``_weighted_sum`` helper, which carries no stability guarantee.
+    if normalize:
+        return np.average(loss, weights=sample_weight)
+    if sample_weight is not None:
+        return np.dot(loss, sample_weight)
+    return loss.sum()