Commit a9a9b0a

use custom log loss

rakow committed Jul 15, 2024
1 parent ff775c6
Showing 2 changed files with 95 additions and 2 deletions.
4 changes: 2 additions & 2 deletions matsim/calibration/run_simulations.py
@@ -14,7 +14,6 @@

METADATA = "run-simulations", "Utility to run multiple simulations at once."


def likelihood_ratio(ll, ll_null):
    return 2 * (ll - ll_null)

@@ -43,7 +42,8 @@ def sample_y_null(shares: np.array, num_persons: int, num_samples: int):

def process_results(runs):
"""Process results of multiple simulations"""
-    from sklearn.metrics import log_loss, accuracy_score
+    from .utils import log_loss
+    from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

print("Processing results in %s" % runs)
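For context, likelihood_ratio above computes the standard likelihood-ratio test statistic 2 * (ll - ll_null). A minimal sketch of how such a statistic is typically turned into a p-value via Wilks' theorem; the log-likelihoods and the degrees of freedom are invented for illustration and are not part of this commit:

    from scipy.stats import chi2

    def likelihood_ratio(ll, ll_null):
        return 2 * (ll - ll_null)

    # Invented log-likelihoods of a fitted model and a null (shares-only) model.
    ll, ll_null = -10500.0, -11200.0

    lr = likelihood_ratio(ll, ll_null)
    # Under the null hypothesis the statistic is approximately chi-square
    # distributed; df is the number of extra model parameters (assumed 3 here).
    p_value = chi2.sf(lr, df=3)
    print("LR = %.1f, p = %.3g" % (lr, p_value))
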
93 changes: 93 additions & 0 deletions matsim/calibration/utils.py
@@ -3,6 +3,16 @@
import numpy as np
import pandas as pd

import warnings  # needed by warnings.warn below, unless already imported above this hunk
from scipy.special import xlogy
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.utils import (
assert_all_finite,
check_array,
check_consistent_length,
column_or_1d,
)
from sklearn.metrics._classification import _weighted_sum
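# Note: _weighted_sum is a private scikit-learn helper (not part of the public
# API), so this import is tied to the scikit-learn version in use.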

from optuna.trial import TrialState


@@ -88,3 +98,86 @@ def _f(jvm_args, jar, config, params_path, run_dir, trial_number, run_args):
)

return _f


def log_loss(y_true, y_pred, *, eps="auto", normalize=True, sample_weight=None, labels=None):
"""Log loss, aka logistic loss or cross-entropy loss. Taken from scikit-learn 1.3."""
y_pred = check_array(
y_pred, ensure_2d=False, dtype=[np.float64, np.float32, np.float16]
)
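    # "auto" resolves eps to the machine epsilon of y_pred's dtype.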
if eps == "auto":
eps = np.finfo(y_pred.dtype).eps

check_consistent_length(y_pred, y_true, sample_weight)
lb = LabelBinarizer()

if labels is not None:
lb.fit(labels)
else:
lb.fit(y_true)

if len(lb.classes_) == 1:
if labels is None:
raise ValueError(
"y_true contains only one label ({0}). Please "
"provide the true labels explicitly through the "
"labels argument.".format(lb.classes_[0])
)
else:
raise ValueError(
"The labels array needs to contain at least two "
"labels for log_loss, "
"got {0}.".format(lb.classes_)
)

transformed_labels = lb.transform(y_true)

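    # Binary case: LabelBinarizer yields one column, expand to [P(neg), P(pos)].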
if transformed_labels.shape[1] == 1:
transformed_labels = np.append(
1 - transformed_labels, transformed_labels, axis=1
)

    # Clip predicted probabilities away from 0 and 1 so the log below stays finite
y_pred = np.clip(y_pred, eps, 1 - eps)

# If y_pred is of single dimension, assume y_true to be binary
# and then check.
if y_pred.ndim == 1:
y_pred = y_pred[:, np.newaxis]
if y_pred.shape[1] == 1:
y_pred = np.append(1 - y_pred, y_pred, axis=1)

# Check if dimensions are consistent.
transformed_labels = check_array(transformed_labels)
if len(lb.classes_) != y_pred.shape[1]:
if labels is None:
raise ValueError(
"y_true and y_pred contain different number of "
"classes {0}, {1}. Please provide the true "
"labels explicitly through the labels argument. "
"Classes found in "
"y_true: {2}".format(
transformed_labels.shape[1], y_pred.shape[1], lb.classes_
)
)
else:
raise ValueError(
"The number of classes in labels is different "
"from that in y_pred. Classes found in "
"labels: {0}".format(lb.classes_)
)

# Renormalize
y_pred_sum = y_pred.sum(axis=1)
if not np.isclose(y_pred_sum, 1, rtol=1e-15, atol=5 * eps).all():
warnings.warn(
(
"The y_pred values do not sum to one. Starting from 1.5 this"
"will result in an error."
),
UserWarning,
)
y_pred = y_pred / y_pred_sum[:, np.newaxis]
loss = -xlogy(transformed_labels, y_pred).sum(axis=1)

return _weighted_sum(loss, sample_weight, normalize)
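
A short usage sketch of the vendored function as a drop-in replacement for sklearn.metrics.log_loss; the labels and probabilities below are invented for illustration:

    import numpy as np
    from matsim.calibration.utils import log_loss

    # Invented mode-choice data: observed modes and predicted probabilities,
    # with columns in sorted label order (bike, car, walk).
    y_true = ["car", "walk", "bike", "car"]
    y_pred = np.array([
        [0.1, 0.7, 0.2],
        [0.2, 0.2, 0.6],
        [0.5, 0.3, 0.2],
        [0.2, 0.5, 0.3],
    ])

    # Mean cross-entropy; pass normalize=False for the summed loss, which
    # corresponds to the negative log-likelihood.
    print(log_loss(y_true, y_pred))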
