From a9a9b0a8317d23b00b1c09c8e3582d6b806a490d Mon Sep 17 00:00:00 2001
From: rakow <rakow@vsp.tu-berlin.de>
Date: Mon, 15 Jul 2024 12:47:21 +0200
Subject: [PATCH] use custom log loss

---
 matsim/calibration/run_simulations.py |  4 +-
 matsim/calibration/utils.py           | 93 +++++++++++++++++++++++++++
 2 files changed, 95 insertions(+), 2 deletions(-)

diff --git a/matsim/calibration/run_simulations.py b/matsim/calibration/run_simulations.py
index c077fae..a698a07 100644
--- a/matsim/calibration/run_simulations.py
+++ b/matsim/calibration/run_simulations.py
@@ -14,7 +14,6 @@
 
 METADATA = "run-simulations", "Utility to run multiple simulations at once."
 
-
 def likelihood_ratio(ll, ll_null):
     return (2 * (ll - ll_null))
 
@@ -43,7 +42,8 @@ def sample_y_null(shares: np.array, num_persons: int, num_samples: int):
 
 def process_results(runs):
     """Process results of multiple simulations"""
-    from sklearn.metrics import log_loss, accuracy_score
+    from .utils import log_loss
+    from sklearn.metrics import accuracy_score
     from sklearn.preprocessing import LabelEncoder
 
     print("Processing results in %s" % runs)
diff --git a/matsim/calibration/utils.py b/matsim/calibration/utils.py
index 9594f28..b954b74 100644
--- a/matsim/calibration/utils.py
+++ b/matsim/calibration/utils.py
@@ -3,6 +3,16 @@
 import numpy as np
 import pandas as pd
 
+from scipy.special import xlogy
+from sklearn.preprocessing import LabelBinarizer, LabelEncoder
+from sklearn.utils import (
+    assert_all_finite,
+    check_array,
+    check_consistent_length,
+    column_or_1d,
+)
+from sklearn.metrics._classification import _weighted_sum
+
 from optuna.trial import TrialState
 
 
@@ -88,3 +98,86 @@ def _f(jvm_args, jar, config, params_path, run_dir, trial_number, run_args):
         )
 
     return _f
+
+
+def log_loss(y_true, y_pred, *, eps="auto", normalize=True, sample_weight=None, labels=None):
+    """Log loss, aka logistic loss or cross-entropy loss. Taken from scikit-learn 1.3.
+
+    Predictions are clipped to ``[eps, 1 - eps]`` and every row is renormalized to
+    sum to one before the loss is computed; rows that did not already sum to one
+    only trigger a ``UserWarning``.
+
+    :param y_true: true label of every sample
+    :param y_pred: predicted probabilities; 1d or single-column input p becomes (1 - p, p)
+    :param eps: clipping bound, "auto" uses the machine epsilon of the dtype of y_pred
+    :param normalize: passed through to the ``_weighted_sum`` helper of scikit-learn
+    :param sample_weight: optional per-sample weights, also passed to ``_weighted_sum``
+    :param labels: optional explicit label set, used instead of inferring it from y_true
+    :return: the per-sample losses aggregated by ``_weighted_sum``
+    :raises ValueError: if only one label is known or the number of labels differs
+        from the number of columns in y_pred
+    """
+    import warnings  # local import, the module imports shown here do not include it
+
+    y_pred = check_array(y_pred, ensure_2d=False, dtype=[np.float64, np.float32, np.float16])
+    if eps == "auto":
+        eps = np.finfo(y_pred.dtype).eps
+
+    check_consistent_length(y_pred, y_true, sample_weight)
+    lb = LabelBinarizer()
+    lb.fit(labels if labels is not None else y_true)
+
+    if len(lb.classes_) == 1:
+        if labels is None:
+            raise ValueError(
+                "y_true contains only one label ({0}). Please provide the true labels "
+                "explicitly through the labels argument.".format(lb.classes_[0])
+            )
+        raise ValueError(
+            "The labels array needs to contain at least two labels for log_loss, "
+            "got {0}.".format(lb.classes_)
+        )
+
+    transformed_labels = lb.transform(y_true)
+    # A single label column is expanded into two complementary columns.
+    if transformed_labels.shape[1] == 1:
+        transformed_labels = np.append(1 - transformed_labels, transformed_labels, axis=1)
+
+    # Clipping
+    y_pred = np.clip(y_pred, eps, 1 - eps)
+
+    # If y_pred is of single dimension, assume y_true to be binary
+    # and then check.
+    if y_pred.ndim == 1:
+        y_pred = y_pred[:, np.newaxis]
+    if y_pred.shape[1] == 1:
+        y_pred = np.append(1 - y_pred, y_pred, axis=1)
+
+    # Check if dimensions are consistent.
+    transformed_labels = check_array(transformed_labels)
+    if len(lb.classes_) != y_pred.shape[1]:
+        if labels is None:
+            raise ValueError(
+                "y_true and y_pred contain different number of classes {0}, {1}. "
+                "Please provide the true labels explicitly through the labels "
+                "argument. Classes found in y_true: {2}".format(
+                    transformed_labels.shape[1], y_pred.shape[1], lb.classes_
+                )
+            )
+        raise ValueError(
+            "The number of classes in labels is different from that in y_pred. "
+            "Classes found in labels: {0}".format(lb.classes_)
+        )
+
+    # Renormalize
+    y_pred_sum = y_pred.sum(axis=1)
+    if not np.isclose(y_pred_sum, 1, rtol=1e-15, atol=5 * eps).all():
+        warnings.warn(
+            "The y_pred values do not sum to one. They are renormalized before the "
+            "loss is computed.",
+            UserWarning,
+        )
+    y_pred = y_pred / y_pred_sum[:, np.newaxis]
+    loss = -xlogy(transformed_labels, y_pred).sum(axis=1)
+
+    return _weighted_sum(loss, sample_weight, normalize)
\ No newline at end of file