def sample_y_null(shares: np.ndarray, num_persons: int, num_samples: int) -> np.ndarray:
    """Replicate a discrete sampling of the null model.

    For each person, ``num_samples`` modes are drawn from the distribution
    given by ``shares``, and the relative frequency of each mode among those
    draws is returned. This makes the null model comparable to the discrete
    sampling of the simulation, instead of comparing sampled predictions
    against the continuous probabilities of the given mode shares.

    The random generator is seeded with a fixed value, so repeated calls with
    the same arguments return the same matrix.

    :param shares: 1-d sequence of mode probabilities, passed as ``p`` to
        ``Generator.choice`` (which requires them to sum to 1).
    :param num_persons: number of rows (persons) to generate.
    :param num_samples: number of draws per person; must be at least 1.
    :return: array of shape ``(num_persons, len(shares))`` where entry
        ``[i, j]`` is the fraction of person ``i``'s draws that hit mode ``j``.
    :raises ValueError: if ``num_samples`` is smaller than 1.
    """
    if num_samples < 1:
        # Dividing the counts by zero would silently produce a NaN matrix.
        raise ValueError("num_samples must be at least 1, got %s" % num_samples)

    shares = np.asarray(shares, dtype=float)
    num_modes = len(shares)

    # Fixed seed keeps the null model reproducible between evaluations.
    rng = np.random.default_rng(seed=4711)
    samples = rng.choice(num_modes, (num_persons, num_samples), p=shares)

    y_null = np.zeros((num_persons, num_modes))

    # Count the draws per mode for all persons at once; only the (small)
    # number of modes is iterated in Python.
    for j in range(num_modes):
        y_null[:, j] = (samples == j).sum(axis=1) / num_samples

    return y_null
choices.to_csv(os.path.join(runs, "choices.csv"), index=False) + accs = [accuracy_score(dfs.true_mode, dfs[col], sample_weight=dfs.weight) for col in pred_cols] accs_d = [accuracy_score(dfs.true_mode, dfs[col], sample_weight=dfs.weight * dists) for col in pred_cols] diff --git a/matsim/scenariogen/data/formats/mid.py b/matsim/scenariogen/data/formats/mid.py index 3b1177d..717e624 100644 --- a/matsim/scenariogen/data/formats/mid.py +++ b/matsim/scenariogen/data/formats/mid.py @@ -101,7 +101,7 @@ def convert(data: tuple, regio=None): trip = Trip( t_id + "_" + str(n), - get(t, "W_GEW", "gew_wege"), p_id, get(t, "H_ID"), + get(t, "W_GEW", "gew_wege"), p_id, str(get_int_id(t, "H_ID")), n, int(t.ST_WOTAG), depature, int(t.wegmin), float(t.wegkm), Mid2017.main_mode(t), Mid2017.purpose(t), None, float(t.wegkm) < 9994 and int(t.wegmin) < 9994 and depature is not None