From d7e0b6679bb085175e4569a383532674fdb2d7c5 Mon Sep 17 00:00:00 2001 From: rakow Date: Tue, 21 May 2024 17:04:08 +0200 Subject: [PATCH 1/7] update mid format for slight variations --- matsim/scenariogen/data/formats/mid.py | 153 ++++++++++++++++++------- 1 file changed, 114 insertions(+), 39 deletions(-) diff --git a/matsim/scenariogen/data/formats/mid.py b/matsim/scenariogen/data/formats/mid.py index 58499a8..1683409 100644 --- a/matsim/scenariogen/data/formats/mid.py +++ b/matsim/scenariogen/data/formats/mid.py @@ -1,10 +1,11 @@ # -*- coding: utf-8 -*- +from collections import Counter + import os import pandas as pd import pyproj import re -from collections import Counter from shapely.geometry import box from shapely.ops import transform @@ -30,24 +31,51 @@ def read_raw(household_file, person_file, trip_file): return hh, p, t +def get_int_id(series, name): + """ Helper function to get integer id from dataframe. """ + + # Sometimes the data set uses the _Lok suffix + if name not in series._fields: + return str(int(getattr(series, name + "_Lok"))) + + return str(int(getattr(series, name))) + +def get(series, *names): + """ Get value from series. Trying possible values until one exists.""" + for name in names: + if name in series._fields: + return getattr(series, name) + + raise Exception("No value found for %s" % names) + + def convert(data: tuple, regio=None): hh, pp, tt = data hhs = [] for h in hh.itertuples(): - hh_id = str(int(h.H_ID_Lok)) - hhs.append(Household(hh_id, h.H_GEW, int(h.hhgr_gr2), int(h.anzauto_gr3), int(h.pedrad), int(h.motmop), - ParkingPosition.NA, - Mid2017.economic_status(h), Mid2017.household_type(h), 0, "", + hh_id = get_int_id(h, "H_ID") + hhs.append(Household(hh_id, get(h, "H_GEW", "gew_hh"), + int(get(h, "hhgr_gr", "hhgr_gr2")), + int(get(h, "anzauto_gr3", "anzauto_gr1")), + int(h.pedrad), + int(get(h, "motmop", "H_ANZMOTMOP")), + ParkingPosition.NA, Mid2017.economic_status(h), Mid2017.household_type(h), 0, "", geom=Mid2017.geom(h))) ps = [] for p in pp.itertuples(): - p_id = str(int(p.HP_ID_Lok)) - hh_id = str(int(p.H_ID_Lok)) + p_id = get_int_id(p, "HP_ID") + + if "H_ID" in p._fields: + hh_id = get_int_id(p, "H_ID") + else: + # Remove last character from person id + hh_id = p_id[:-1] + "0" ps.append(Person( - p_id, p.P_GEW, hh_id, Mid2017.age(p), Mid2017.gender(p), Mid2017.employment(p), None, + p_id, get(p, "P_GEW", "gew_pers"), + hh_id, Mid2017.age(p), Mid2017.gender(p), Mid2017.employment(p), None, Mid2017.driving_license(p), Mid2017.car_avail(p), Mid2017.bike_avail(p), Mid2017.pt_abo_avail(p), Mid2017.mobile(p), Mid2017.present_on_day(p), int(p.ST_WOTAG), int(p.anzwege1) )) @@ -63,8 +91,8 @@ def convert(data: tuple, regio=None): continue # Person might have multiple days - p_id = str(int(t.HP_ID_Lok)) - t_id = p_id + "%d%d" % (t.ST_WOCHE, t.ST_WOTAG) + p_id = get_int_id(t, "HP_ID") + t_id = p_id + "%d%d" % (get(t, "ST_WOCHE", "ST_MONAT"), t.ST_WOTAG) n = counts[t_id] counts[t_id] += 1 @@ -73,8 +101,8 @@ def convert(data: tuple, regio=None): trip = Trip( t_id + "_" + str(n), - t.W_GEW, p_id, str(int(t.H_ID_Lok)), - n, int(t.ST_WOTAG),depature, int(t.wegmin), float(t.wegkm), + get(t, "W_GEW", "gew_wege"), p_id, get(t, "H_ID"), + n, int(t.ST_WOTAG), depature, int(t.wegmin), float(t.wegkm), Mid2017.main_mode(t), Mid2017.purpose(t), None, float(t.wegkm) < 1000 and int(t.wegmin) < 1000 and depature is not None ) @@ -90,10 +118,12 @@ def convert(data: tuple, regio=None): return pd.DataFrame(hhs).set_index("hh_id"), pd.DataFrame(ps).set_index("p_id"), pd.DataFrame(ts).set_index("t_id") + def flip(x, y): """Flips the x and y coordinate values""" return y, x + class Mid2017: """ Converter for 2017 MID format """ @@ -115,49 +145,94 @@ def coordinate(value, grid_size, scale): @staticmethod def geom(h): - if h.GITTER_500m.strip(): + if "GITTER_500m" in h._fields and h.GITTER_500m.strip(): return Mid2017.coordinate(h.GITTER_500m, 500, 100) - elif h.GITTER_1km.strip(): + elif "GITTER_1km" in h._fields and h.GITTER_1km.strip(): return Mid2017.coordinate(h.GITTER_1km, 1000, 1000) - elif h.GITTER_5km.strip(): + elif "GITTER_5km" in h._fields and h.GITTER_5km.strip(): return Mid2017.coordinate(h.GITTER_5km, 5000, 1000) - return None + return pd.NA @staticmethod def economic_status(h): return list(EconomicStatus)[int(h.oek_status) - 1] + @staticmethod + def hh_size(h): + return int(get(h, "hhgr_gr", "hhgr_gr2")) + @staticmethod def household_type(h): - x = int(h.hhtyp2) + x = int(get(h, "hhtyp2", "hhtyp")) if x == 2: return HouseholdType.MULTI_W_CHILDREN elif x == 3: return HouseholdType.MULTI_WO_CHILDREN - return HouseholdType.SINGLE if int(h.hhgr_gr2) == 1 else HouseholdType.MULTI_WO_CHILDREN + return HouseholdType.SINGLE if Mid2017.hh_size(h) == 1 else HouseholdType.MULTI_WO_CHILDREN + + @staticmethod + def hh_size(h): + if "hhgr_gr" in h._fields: + return int(h.hhgr_gr) + + return int(h.hhgr_gr2) + + @staticmethod + def n_cars(h): + # Most detailed group + if "anzauto_gr3" in h._fields: + return int(h.anzauto_gr3) + + # Anzahl Autos im HH in Gruppen (0 bis 4+) + return int(h.anzauto_gr1) @staticmethod def age(p): - x = int(p.alter_gr) - if x == 1: - return 0 - elif x == 2: - return 18 - elif x == 3: - return 30 - elif x == 4: - return 40 - elif x == 5: - return 50 - elif x == 6: - return 60 - elif x == 7: - return 70 - elif x == 8: - return 80 + if "HP_ALTER" in p._fields: + return int(p.HP_ALTER) + + if "alter_gr1" in p._fields: + x = int(p.alter_gr1) + if x == 1: + return 0 + elif x == 2: + return 6 + elif x == 3: + return 10 + elif x == 4: + return 14 + elif x == 5: + return 18 + elif x == 6: + return 25 + elif x == 7: + return 45 + elif x == 8: + return 60 + elif x == 9: + return 65 + + else: + x = int(p.alter_gr) + if x == 1: + return 0 + elif x == 2: + return 18 + elif x == 3: + return 30 + elif x == 4: + return 40 + elif x == 5: + return 50 + elif x == 6: + return 60 + elif x == 7: + return 70 + elif x == 8: + return 80 return -1 @@ -172,7 +247,7 @@ def gender(p): @staticmethod def employment(p): - x = int(p.taet) + x = int(get(p, "taet", "HP_TAET")) if x == 1: return Employment.JOB_FULL_TIME elif x == 2: @@ -267,7 +342,7 @@ def main_mode(t): @staticmethod def purpose(t): - x = int (t.zweck) + x = int(get(t, "zweck", "W_ZWECK")) if x == 1: return Purpose.WORK elif x == 2: @@ -284,7 +359,7 @@ def purpose(t): return Purpose.LEISURE elif x == 8: return Purpose.HOME - elif x == 9: # trip back to previous + elif x == 9: # trip back to previous return Purpose.WAYBACK - return Purpose.OTHER \ No newline at end of file + return Purpose.OTHER From d70193d9504a583f180b0c9c9178fa2f3210a2dd Mon Sep 17 00:00:00 2001 From: rakow Date: Tue, 21 May 2024 17:49:37 +0200 Subject: [PATCH 2/7] update filtering of invalid entries --- matsim/scenariogen/data/formats/mid.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/matsim/scenariogen/data/formats/mid.py b/matsim/scenariogen/data/formats/mid.py index 1683409..3b1177d 100644 --- a/matsim/scenariogen/data/formats/mid.py +++ b/matsim/scenariogen/data/formats/mid.py @@ -104,7 +104,7 @@ def convert(data: tuple, regio=None): get(t, "W_GEW", "gew_wege"), p_id, get(t, "H_ID"), n, int(t.ST_WOTAG), depature, int(t.wegmin), float(t.wegkm), Mid2017.main_mode(t), Mid2017.purpose(t), None, - float(t.wegkm) < 1000 and int(t.wegmin) < 1000 and depature is not None + float(t.wegkm) < 9994 and int(t.wegmin) < 9994 and depature is not None ) if trip.purpose == Purpose.WAYBACK: From 69e87e31e4faecb94ce54e8f182623b9f0e05584 Mon Sep 17 00:00:00 2001 From: rakow Date: Fri, 7 Jun 2024 18:45:21 +0200 Subject: [PATCH 3/7] add few more attributes to activities --- matsim/scenariogen/data/__init__.py | 5 +++- matsim/scenariogen/data/preparation.py | 34 +++++++++++++++++++------- 2 files changed, 29 insertions(+), 10 deletions(-) diff --git a/matsim/scenariogen/data/__init__.py b/matsim/scenariogen/data/__init__.py index 0555f6e..75ba977 100644 --- a/matsim/scenariogen/data/__init__.py +++ b/matsim/scenariogen/data/__init__.py @@ -366,8 +366,9 @@ class Trip: @dataclass class Activity: - """ Activity information (including leg) """ + """ Activity information (including some leg information as well) """ a_id: str + a_weight: float p_id: str n: int type: Purpose @@ -375,3 +376,5 @@ class Activity: leg_dist: float leg_duration: float leg_mode: TripMode + location: str = pd.NA + zone: str = pd.NA diff --git a/matsim/scenariogen/data/preparation.py b/matsim/scenariogen/data/preparation.py index 2029a75..c134b05 100644 --- a/matsim/scenariogen/data/preparation.py +++ b/matsim/scenariogen/data/preparation.py @@ -95,12 +95,28 @@ def augment_persons(pp, factor=1, permute_age=0.5): """ pp = pp.reset_index() - duplicated = pp.loc[pp.index.repeat(np.maximum(1, np.rint(pp.p_weight * factor)))] + repeats = np.maximum(1, np.rint(pp.p_weight * factor)).to_numpy(int) + duplicated = pp.loc[pp.index.repeat(repeats)] + + # Each sample has a unique sequence number + seq = np.zeros(len(duplicated), dtype=int) + + i = 0 + for r in repeats: + for s in range(r): + seq[i] = s + i += 1 + + duplicated["seq"] = seq if permute_age > 0: np.random.seed(0) age_permute = np.rint(np.random.normal(0, permute_age, len(duplicated))).astype(np.int32) + + # Non duplicated person is not permuted + age_permute[seq == 0] = 0 + duplicated.age += age_permute # 0 age is minimum @@ -264,11 +280,11 @@ def a_id(t_i): return "%s_%d" % (p.Index, t_i) if len(trips) == 0: - acts.append(Activity(a_id(0), p.Index, 0, Purpose.HOME, 1440, 0, 0, TripMode.OTHER)) + acts.append(Activity(a_id(0), p.p_weight, p.Index, 0, Purpose.HOME, 1440, 0, 0, TripMode.OTHER)) else: - acts.append( - Activity(a_id(0), p.Index, 0, trips.iloc[0].sd_group.source(), trips.iloc[0].departure, 0, 0, - TripMode.OTHER)) + trip_0 = trips.iloc[0] + acts.append(Activity(a_id(0), p.p_weight, p.Index, 0, trip_0.sd_group.source(), trip_0.departure, 0, 0, + TripMode.OTHER, trip_0.from_location, trip_0.from_zone)) for i in range(len(trips) - 1): t0 = trips.iloc[i] @@ -279,8 +295,8 @@ def a_id(t_i): if duration < 0 or t0.gis_length < 0: valid = False - acts.append(Activity(a_id(i + 1), p.Index, i + 1, t0.purpose, - duration, t0.gis_length, t0.duration, t0.main_mode)) + acts.append(Activity(a_id(i + 1), p.p_weight, p.Index, i + 1, t0.purpose,duration, t0.gis_length, + t0.duration, t0.main_mode, t0.to_location, t0.to_zone)) if len(trips) > 1: i += 1 @@ -291,8 +307,8 @@ def a_id(t_i): valid = False # Duration is set to rest of day - acts.append( - Activity(a_id(i + 1), p.Index, i + 1, tl.purpose, 1440, tl.gis_length, tl.duration, tl.main_mode)) + acts.append(Activity(a_id(i + 1), p.p_weight, p.Index, i + 1, tl.purpose, 1440, tl.gis_length, tl.duration, tl.main_mode, + tl.to_location, tl.to_zone)) if valid: res.extend(acts) From 4de5a32b4229efe68b0aa55ef9a59a5984edafd4 Mon Sep 17 00:00:00 2001 From: rakow Date: Mon, 10 Jun 2024 17:04:49 +0200 Subject: [PATCH 4/7] calibration cli and script to run multiple simulations at once --- matsim/calibration/__init__.py | 1 + matsim/calibration/__main__.py | 39 ++++----- matsim/calibration/run_create_csv.py | 35 ++++++++ matsim/calibration/run_simulations.py | 121 ++++++++++++++++++++++++++ setup.py | 1 + 5 files changed, 174 insertions(+), 23 deletions(-) create mode 100644 matsim/calibration/run_create_csv.py create mode 100644 matsim/calibration/run_simulations.py diff --git a/matsim/calibration/__init__.py b/matsim/calibration/__init__.py index d999ae6..a0fcb19 100644 --- a/matsim/calibration/__init__.py +++ b/matsim/calibration/__init__.py @@ -213,6 +213,7 @@ def f(trial): if os.name != 'nt': cmd = cmd.split(" ") + cmd = [c for c in cmd if c != ""] p = subprocess.Popen(cmd, stdout=sys.stdout if debug else subprocess.DEVNULL, diff --git a/matsim/calibration/__main__.py b/matsim/calibration/__main__.py index 576f745..08cc4b1 100644 --- a/matsim/calibration/__main__.py +++ b/matsim/calibration/__main__.py @@ -1,36 +1,29 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -import optuna -import traceback +from . import run_create_csv +from . import run_simulations -from . import study_as_df +def _add(subparsers, m): + """ Adds module to as subcommand""" + s1 = subparsers.add_parser(m.METADATA[0], help=m.METADATA[1]) + m.setup(s1) + s1.set_defaults(func=m.main) if __name__ == "__main__": import argparse - parser = argparse.ArgumentParser(prog="matsim-calibration", description="Calibration CLI") - parser.add_argument('file', nargs=1, type=str, help="Path to input db") - parser.add_argument("--name", type=str, default="calib", help="Calibration name") - parser.add_argument("--output", default=None, help="Output path") - args = parser.parse_args() - - study = optuna.load_study( - study_name=args.name, - storage="sqlite:///%s" % args.file[0], - ) + parser = argparse.ArgumentParser(prog="matsim-calibration", description="MATSim calibration command line utility") - if not args.output: - args.output = args.file[0] + ".csv" + subparsers = parser.add_subparsers(title="Subcommands") - df = study_as_df(study) - df.to_csv(args.output, index=False) + _add(subparsers, run_create_csv) + _add(subparsers, run_simulations) - try: - from .plot import plot_study - plot_study(study) + args = parser.parse_args() - except ImportError: - print("Could not plot study.") - traceback.print_exc() + if not hasattr(args, 'func'): + parser.print_help() + else: + args.func(args) diff --git a/matsim/calibration/run_create_csv.py b/matsim/calibration/run_create_csv.py new file mode 100644 index 0000000..ce6d1d0 --- /dev/null +++ b/matsim/calibration/run_create_csv.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import argparse + +METADATA = "create-csv", "Create plots and csv from calibration study." + +def setup(parser: argparse.ArgumentParser): + parser.add_argument('file', nargs=1, type=str, help="Path to input db") + parser.add_argument("--name", type=str, default="calib", help="Calibration name") + parser.add_argument("--output", default=None, help="Output path") + +def main(args): + + import optuna + from . import study_as_df + + study = optuna.load_study( + study_name=args.name, + storage="sqlite:///%s" % args.file[0], + ) + + if not args.output: + args.output = args.file[0] + ".csv" + + df = study_as_df(study) + df.to_csv(args.output, index=False) + + try: + from .plot import plot_study + plot_study(study) + + except ImportError: + print("Could not plot study.") + traceback.print_exc() \ No newline at end of file diff --git a/matsim/calibration/run_simulations.py b/matsim/calibration/run_simulations.py new file mode 100644 index 0000000..30b88db --- /dev/null +++ b/matsim/calibration/run_simulations.py @@ -0,0 +1,121 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import argparse +import os +import subprocess +import sys +from os import makedirs +from time import sleep +from typing import Union, Callable + +METADATA = "run-simulations", "Utility to run multiple simulations at once." + + +def process_results(directory): + """Process results of multiple simulations""" + + print("Processing results in %s" % directory) + + +def run(jar: Union[str, os.PathLike], + config: Union[str, os.PathLike], + args: Union[str, Callable] = "", + jvm_args="", + runs: int = 10, + worker_id: int = 0, + workers: int = 1, + seed: int = 4711, + overwrite: bool = False, + custom_cli: Callable = None, + debug: bool = False): + """Run multiple simulations using different seeds at once. Simulations will be performed sequentially. + For parallel execution, multiple workers must be started. + + :param jar: path to executable jar file of the scenario + :param config: path to config file to run + :param args: arguments to pass to the simulation + :param jvm_args: arguments to pass to the JVM + :param runs: number of simulations to run + :param worker_id: id of this process + :param workers: total number of processes + :param seed: starting seed + :param overwrite: overwrite existing output directory + :param custom_cli: use custom command line interface + :param debug: if true, output will be printed to console + """ + if not os.access(jar, os.R_OK): + raise ValueError("Can not access JAR File: %s" % jar) + + if not os.access(config, os.R_OK): + raise ValueError("Can not access config File: %s" % config) + + if not os.path.exists("eval-runs"): + makedirs("eval-runs") + + for i in range(runs): + if i % workers != worker_id: + continue + + run_dir = "eval-runs/%03d" % i + + if os.path.exists(run_dir) and not overwrite: + print("Run %s already exists, skipping." % run_dir) + continue + + run_args = args(completed) if callable(args) else args + + # Same custom cli interface as calibration + if custom_cli: + cmd = custom_cli(jvm_args, jar, config, params_path, run_dir, trial.number, run_args) + else: + cmd = "java %s -jar %s run --config %s --output %s --runId %03d --config:global.randomSeed=%d %s" \ + % (jvm_args, jar, config, run_dir, i, seed + i, run_args) + + # Extra whitespaces will break argument parsing + cmd = cmd.strip() + + print("Running cmd %s" % cmd) + + if os.name != 'nt': + cmd = cmd.split(" ") + cmd = [c for c in cmd if c != ""] + + p = subprocess.Popen(cmd, + stdout=sys.stdout if debug else subprocess.DEVNULL, + stderr=sys.stderr if debug else subprocess.DEVNULL) + + try: + while p.poll() is None: + sleep(1) + + if p.returncode != 0: + print("The scenario could not be run properly and returned with an error code.", file=sys.stderr) + if not debug: + print("Set debug=True and check the output for any errors.", file=sys.stderr) + print("Alternatively run the cmd from the log above manually and check for errors.", + file=sys.stderr) + + raise Exception("Process returned with error code: %s." % p.returncode) + finally: + p.terminate() + + process_results("eval_runs") + + +def setup(parser: argparse.ArgumentParser): + parser.add_argument("--jar", type=str, required=True, help="Path to executable JAR file") + parser.add_argument("--config", type=str, required=True, help="Path to config file") + parser.add_argument("--args", type=str, default="", help="Arguments to pass to the simulation") + parser.add_argument("--jvm-args", type=str, default="", help="Arguments to pass to the JVM") + parser.add_argument("--runs", type=int, default=10, help="Number of simulations to run") + parser.add_argument("--worker-id", type=int, default=0, help="ID of this worker") + parser.add_argument("--workers", type=int, default=1, help="Total number of workers") + parser.add_argument("--seed", type=int, default=4711, help="Starting seed") + parser.add_argument("--overwrite", action="store_true", help="Overwrite existing output directories") + parser.add_argument("--debug", action="store_true", help="Print output to console") + + +def main(args): + run(args.jar, args.config, args.args, args.jvm_args, args.runs, args.worker_id, args.workers, args.seed, + args.overwrite, debug=args.debug) diff --git a/setup.py b/setup.py index c575579..70609a4 100644 --- a/setup.py +++ b/setup.py @@ -42,6 +42,7 @@ 'console_scripts': [ 'matsim-tools=matsim.cli.main:main', 'matsim-scenariogen=matsim.scenariogen:main', + 'matsim-calibration=matsim.calibration:main', 'matsim-viz=matsim.viz:main', ] }, From 51168f4c6486769c8c2e05ddecc734575e8b9050 Mon Sep 17 00:00:00 2001 From: rakow Date: Mon, 10 Jun 2024 17:10:05 +0200 Subject: [PATCH 5/7] fix linting --- matsim/calibration/run_create_csv.py | 1 + matsim/calibration/run_simulations.py | 9 ++++++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/matsim/calibration/run_create_csv.py b/matsim/calibration/run_create_csv.py index ce6d1d0..c4a89e3 100644 --- a/matsim/calibration/run_create_csv.py +++ b/matsim/calibration/run_create_csv.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- import argparse +import traceback METADATA = "create-csv", "Create plots and csv from calibration study." diff --git a/matsim/calibration/run_simulations.py b/matsim/calibration/run_simulations.py index 30b88db..4b09093 100644 --- a/matsim/calibration/run_simulations.py +++ b/matsim/calibration/run_simulations.py @@ -50,6 +50,9 @@ def run(jar: Union[str, os.PathLike], if not os.access(config, os.R_OK): raise ValueError("Can not access config File: %s" % config) + if worker_id >= workers: + raise ValueError("Worker ID must be smaller than number of workers (starts at 0).") + if not os.path.exists("eval-runs"): makedirs("eval-runs") @@ -63,11 +66,11 @@ def run(jar: Union[str, os.PathLike], print("Run %s already exists, skipping." % run_dir) continue - run_args = args(completed) if callable(args) else args + run_args = args(i) if callable(args) else args - # Same custom cli interface as calibration + # Similar custom cli interface as calibration if custom_cli: - cmd = custom_cli(jvm_args, jar, config, params_path, run_dir, trial.number, run_args) + cmd = custom_cli(jvm_args, jar, config, run_dir, i, seed + i, run_args) else: cmd = "java %s -jar %s run --config %s --output %s --runId %03d --config:global.randomSeed=%d %s" \ % (jvm_args, jar, config, run_dir, i, seed + i, run_args) From 916a0571b70e95b4f00e2cd60c7bcdd6d0742ff2 Mon Sep 17 00:00:00 2001 From: rakow Date: Thu, 13 Jun 2024 20:28:29 +0200 Subject: [PATCH 6/7] implemented log-likelihood calculation --- matsim/calibration/run_simulations.py | 55 +++++++++++++++++++++++++-- setup.py | 2 +- 2 files changed, 53 insertions(+), 4 deletions(-) diff --git a/matsim/calibration/run_simulations.py b/matsim/calibration/run_simulations.py index 4b09093..5660f07 100644 --- a/matsim/calibration/run_simulations.py +++ b/matsim/calibration/run_simulations.py @@ -9,13 +9,62 @@ from time import sleep from typing import Union, Callable +import pandas as pd +import numpy as np + METADATA = "run-simulations", "Utility to run multiple simulations at once." -def process_results(directory): +def process_results(runs): """Process results of multiple simulations""" + from sklearn.metrics import log_loss + from sklearn.preprocessing import LabelEncoder + + print("Processing results in %s" % runs) + + dfs = None + for run in os.listdir(runs): + if not os.path.isdir(os.path.join(runs, run)): + continue + print("Processing run %s" % run) + + df = pd.read_csv(os.path.join(runs, run, "analysis", "population", "mode_choices.csv")) + if dfs is None: + dfs = df + else: + dfs= dfs.merge(df, left_on=["person", "n", "true_mode"], right_on=["person", "n", "true_mode"], suffixes=("", "_%s" % run)) + + shares = dfs.groupby("true_mode").size() / len(dfs) + modes = shares.index + + labels = LabelEncoder().fit(modes) + y_true = labels.transform(dfs["true_mode"]) + y_null = np.tile(shares.to_numpy(), reps=(len(y_true), 1)) + y_pred = np.zeros((len(y_true), len(modes))) + dists = dfs.euclidean_distance.to_numpy() / 1000 + + pred_cols = [c for c in dfs.columns if c.startswith("pred_mode")] + for p in dfs[pred_cols].itertuples(): + + for j, m in enumerate(modes): + c = 0 + for col in pred_cols: + if getattr(p, col) == m: + c += 1 + + y_pred[p.Index, j] = c / len(pred_cols) + + result = [ + ("Log likelihood", -log_loss(y_true, y_pred, sample_weight=dfs.weight, normalize=False), -log_loss(y_true, y_pred, sample_weight=dfs.weight * dists, normalize=False)), + ("Log likelihood (normalized)", -log_loss(y_true, y_pred, sample_weight=dfs.weight, normalize=True), -log_loss(y_true, y_pred, sample_weight=dfs.weight * dists, normalize=True)), + ("Log likelihood (null)", -log_loss(y_true, y_null, sample_weight=dfs.weight, normalize=False), -log_loss(y_true, y_null, sample_weight=dfs.weight * dists, normalize=False)), + ("Samples", len(dfs), sum(dists)), + ] + + df = pd.DataFrame(result, columns=["Metric", "Value", "Distance weighted"]).set_index("Metric") + print(df) - print("Processing results in %s" % directory) + df.to_csv(os.path.join(runs, "results.csv"), index=True) def run(jar: Union[str, os.PathLike], @@ -103,7 +152,7 @@ def run(jar: Union[str, os.PathLike], finally: p.terminate() - process_results("eval_runs") + process_results("eval-runs") def setup(parser: argparse.ArgumentParser): diff --git a/setup.py b/setup.py index 70609a4..e12a2fe 100644 --- a/setup.py +++ b/setup.py @@ -30,7 +30,7 @@ "pandas >= 2.1.0", ], extras_require={ - 'calibration': ["optuna >= 3.3.0", "shapely >= 1.8.0", "geopandas >= 0.6.0"], + 'calibration': ["optuna >= 3.3.0", "shapely >= 1.8.0", "geopandas >= 0.6.0", "scikit-learn"], # m2cgen has problems with newer xgb, see this issue # https://github.com/BayesWitnesses/m2cgen/issues/581 'scenariogen': ["sumolib", "traci", "lxml", "optax", "requests", "tqdm", "scikit-learn", "xgboost==1.7.1", "lightgbm", From b431704988869729c3d332c9116adddd1503471a Mon Sep 17 00:00:00 2001 From: rakow Date: Sat, 15 Jun 2024 15:13:59 +0200 Subject: [PATCH 7/7] add accuracy score --- matsim/calibration/run_simulations.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/matsim/calibration/run_simulations.py b/matsim/calibration/run_simulations.py index 5660f07..7a7607d 100644 --- a/matsim/calibration/run_simulations.py +++ b/matsim/calibration/run_simulations.py @@ -17,7 +17,7 @@ def process_results(runs): """Process results of multiple simulations""" - from sklearn.metrics import log_loss + from sklearn.metrics import log_loss, accuracy_score from sklearn.preprocessing import LabelEncoder print("Processing results in %s" % runs) @@ -26,7 +26,6 @@ def process_results(runs): for run in os.listdir(runs): if not os.path.isdir(os.path.join(runs, run)): continue - print("Processing run %s" % run) df = pd.read_csv(os.path.join(runs, run, "analysis", "population", "mode_choices.csv")) if dfs is None: @@ -54,11 +53,16 @@ def process_results(runs): y_pred[p.Index, j] = c / len(pred_cols) + accs = [accuracy_score(dfs.true_mode, dfs[col], sample_weight=dfs.weight) for col in pred_cols] + accs_d = [accuracy_score(dfs.true_mode, dfs[col], sample_weight=dfs.weight * dists) for col in pred_cols] + result = [ ("Log likelihood", -log_loss(y_true, y_pred, sample_weight=dfs.weight, normalize=False), -log_loss(y_true, y_pred, sample_weight=dfs.weight * dists, normalize=False)), ("Log likelihood (normalized)", -log_loss(y_true, y_pred, sample_weight=dfs.weight, normalize=True), -log_loss(y_true, y_pred, sample_weight=dfs.weight * dists, normalize=True)), + ("Mean Accuracy", np.mean(accs), np.mean(accs_d)), ("Log likelihood (null)", -log_loss(y_true, y_null, sample_weight=dfs.weight, normalize=False), -log_loss(y_true, y_null, sample_weight=dfs.weight * dists, normalize=False)), ("Samples", len(dfs), sum(dists)), + ("Runs", len(pred_cols), len(pred_cols)) ] df = pd.DataFrame(result, columns=["Metric", "Value", "Distance weighted"]).set_index("Metric")