From d7e0b6679bb085175e4569a383532674fdb2d7c5 Mon Sep 17 00:00:00 2001
From: rakow <rakow@vsp.tu-berlin.de>
Date: Tue, 21 May 2024 17:04:08 +0200
Subject: [PATCH 1/7] update mid format for slight variations

---
 matsim/scenariogen/data/formats/mid.py | 153 ++++++++++++++++++-------
 1 file changed, 114 insertions(+), 39 deletions(-)

diff --git a/matsim/scenariogen/data/formats/mid.py b/matsim/scenariogen/data/formats/mid.py
index 58499a8..1683409 100644
--- a/matsim/scenariogen/data/formats/mid.py
+++ b/matsim/scenariogen/data/formats/mid.py
@@ -1,10 +1,11 @@
 # -*- coding: utf-8 -*-
 
+from collections import Counter
+
 import os
 import pandas as pd
 import pyproj
 import re
-from collections import Counter
 from shapely.geometry import box
 from shapely.ops import transform
 
@@ -30,24 +31,51 @@ def read_raw(household_file, person_file, trip_file):
     return hh, p, t
 
 
+def get_int_id(series, name):
+    """ Helper function to get integer id from dataframe. """
+
+    # Sometimes the data set uses the _Lok suffix
+    if name not in series._fields:
+        return str(int(getattr(series, name + "_Lok")))
+
+    return str(int(getattr(series, name)))
+
+def get(series, *names):
+    """ Get value from series. Trying possible values until one exists."""
+    for name in names:
+        if name in series._fields:
+            return getattr(series, name)
+
+    raise Exception("No value found for %s" % names)
+
+
 def convert(data: tuple, regio=None):
     hh, pp, tt = data
 
     hhs = []
     for h in hh.itertuples():
-        hh_id = str(int(h.H_ID_Lok))
-        hhs.append(Household(hh_id, h.H_GEW, int(h.hhgr_gr2), int(h.anzauto_gr3), int(h.pedrad), int(h.motmop),
-                             ParkingPosition.NA,
-                             Mid2017.economic_status(h), Mid2017.household_type(h), 0, "",
+        hh_id = get_int_id(h, "H_ID")
+        hhs.append(Household(hh_id, get(h, "H_GEW", "gew_hh"),
+                             int(get(h, "hhgr_gr", "hhgr_gr2")),
+                             int(get(h, "anzauto_gr3", "anzauto_gr1")),
+                             int(h.pedrad),
+                             int(get(h, "motmop", "H_ANZMOTMOP")),
+                             ParkingPosition.NA, Mid2017.economic_status(h), Mid2017.household_type(h), 0, "",
                              geom=Mid2017.geom(h)))
 
     ps = []
     for p in pp.itertuples():
-        p_id = str(int(p.HP_ID_Lok))
-        hh_id = str(int(p.H_ID_Lok))
+        p_id = get_int_id(p, "HP_ID")
+
+        if "H_ID" in p._fields:
+            hh_id = get_int_id(p, "H_ID")
+        else:
+            # Remove last character from person id
+            hh_id = p_id[:-1] + "0"
 
         ps.append(Person(
-            p_id, p.P_GEW, hh_id, Mid2017.age(p), Mid2017.gender(p), Mid2017.employment(p), None,
+            p_id, get(p, "P_GEW", "gew_pers"),
+            hh_id, Mid2017.age(p), Mid2017.gender(p), Mid2017.employment(p), None,
             Mid2017.driving_license(p), Mid2017.car_avail(p), Mid2017.bike_avail(p), Mid2017.pt_abo_avail(p),
             Mid2017.mobile(p), Mid2017.present_on_day(p), int(p.ST_WOTAG), int(p.anzwege1)
         ))
@@ -63,8 +91,8 @@ def convert(data: tuple, regio=None):
             continue
 
         # Person might have multiple days
-        p_id = str(int(t.HP_ID_Lok))
-        t_id = p_id + "%d%d" % (t.ST_WOCHE, t.ST_WOTAG)
+        p_id = get_int_id(t, "HP_ID")
+        t_id = p_id + "%d%d" % (get(t, "ST_WOCHE", "ST_MONAT"), t.ST_WOTAG)
 
         n = counts[t_id]
         counts[t_id] += 1
@@ -73,8 +101,8 @@ def convert(data: tuple, regio=None):
 
         trip = Trip(
             t_id + "_" + str(n),
-            t.W_GEW, p_id, str(int(t.H_ID_Lok)),
-            n, int(t.ST_WOTAG),depature, int(t.wegmin), float(t.wegkm),
+            get(t, "W_GEW", "gew_wege"), p_id, get(t, "H_ID"),
+            n, int(t.ST_WOTAG), depature, int(t.wegmin), float(t.wegkm),
             Mid2017.main_mode(t), Mid2017.purpose(t), None,
             float(t.wegkm) < 1000 and int(t.wegmin) < 1000 and depature is not None
         )
@@ -90,10 +118,12 @@ def convert(data: tuple, regio=None):
 
     return pd.DataFrame(hhs).set_index("hh_id"), pd.DataFrame(ps).set_index("p_id"), pd.DataFrame(ts).set_index("t_id")
 
+
 def flip(x, y):
     """Flips the x and y coordinate values"""
     return y, x
 
+
 class Mid2017:
     """ Converter for 2017 MID format """
 
@@ -115,49 +145,94 @@ def coordinate(value, grid_size, scale):
 
     @staticmethod
     def geom(h):
-        if h.GITTER_500m.strip():
+        if "GITTER_500m" in h._fields and h.GITTER_500m.strip():
             return Mid2017.coordinate(h.GITTER_500m, 500, 100)
-        elif h.GITTER_1km.strip():
+        elif "GITTER_1km" in h._fields and h.GITTER_1km.strip():
             return Mid2017.coordinate(h.GITTER_1km, 1000, 1000)
-        elif h.GITTER_5km.strip():
+        elif "GITTER_5km" in h._fields and h.GITTER_5km.strip():
             return Mid2017.coordinate(h.GITTER_5km, 5000, 1000)
 
-        return None
+        return pd.NA
 
     @staticmethod
     def economic_status(h):
         return list(EconomicStatus)[int(h.oek_status) - 1]
 
+    @staticmethod
+    def hh_size(h):
+        return int(get(h, "hhgr_gr", "hhgr_gr2"))
+
     @staticmethod
     def household_type(h):
-        x = int(h.hhtyp2)
+        x = int(get(h, "hhtyp2", "hhtyp"))
 
         if x == 2:
             return HouseholdType.MULTI_W_CHILDREN
         elif x == 3:
             return HouseholdType.MULTI_WO_CHILDREN
 
-        return HouseholdType.SINGLE if int(h.hhgr_gr2) == 1 else HouseholdType.MULTI_WO_CHILDREN
+        return HouseholdType.SINGLE if Mid2017.hh_size(h) == 1 else HouseholdType.MULTI_WO_CHILDREN
+
+    @staticmethod
+    def hh_size(h):
+        if "hhgr_gr" in h._fields:
+            return int(h.hhgr_gr)
+
+        return int(h.hhgr_gr2)
+
+    @staticmethod
+    def n_cars(h):
+        # Most detailed group
+        if "anzauto_gr3" in h._fields:
+            return int(h.anzauto_gr3)
+
+        # Anzahl Autos im HH in Gruppen (0 bis 4+)
+        return int(h.anzauto_gr1)
 
     @staticmethod
     def age(p):
-        x = int(p.alter_gr)
-        if x == 1:
-            return 0
-        elif x == 2:
-            return 18
-        elif x == 3:
-            return 30
-        elif x == 4:
-            return 40
-        elif x == 5:
-            return 50
-        elif x == 6:
-            return 60
-        elif x == 7:
-            return 70
-        elif x == 8:
-            return 80
+        if "HP_ALTER" in p._fields:
+            return int(p.HP_ALTER)
+
+        if "alter_gr1" in p._fields:
+            x = int(p.alter_gr1)
+            if x == 1:
+                return 0
+            elif x == 2:
+                return 6
+            elif x == 3:
+                return 10
+            elif x == 4:
+                return 14
+            elif x == 5:
+                return 18
+            elif x == 6:
+                return 25
+            elif x == 7:
+                return 45
+            elif x == 8:
+                return 60
+            elif x == 9:
+                return 65
+
+        else:
+            x = int(p.alter_gr)
+            if x == 1:
+                return 0
+            elif x == 2:
+                return 18
+            elif x == 3:
+                return 30
+            elif x == 4:
+                return 40
+            elif x == 5:
+                return 50
+            elif x == 6:
+                return 60
+            elif x == 7:
+                return 70
+            elif x == 8:
+                return 80
 
         return -1
 
@@ -172,7 +247,7 @@ def gender(p):
 
     @staticmethod
     def employment(p):
-        x = int(p.taet)
+        x = int(get(p, "taet", "HP_TAET"))
         if x == 1:
             return Employment.JOB_FULL_TIME
         elif x == 2:
@@ -267,7 +342,7 @@ def main_mode(t):
 
     @staticmethod
     def purpose(t):
-        x = int (t.zweck)
+        x = int(get(t, "zweck", "W_ZWECK"))
         if x == 1:
             return Purpose.WORK
         elif x == 2:
@@ -284,7 +359,7 @@ def purpose(t):
             return Purpose.LEISURE
         elif x == 8:
             return Purpose.HOME
-        elif x == 9: # trip back to previous
+        elif x == 9:  # trip back to previous
             return Purpose.WAYBACK
 
-        return Purpose.OTHER
\ No newline at end of file
+        return Purpose.OTHER

From d70193d9504a583f180b0c9c9178fa2f3210a2dd Mon Sep 17 00:00:00 2001
From: rakow <rakow@vsp.tu-berlin.de>
Date: Tue, 21 May 2024 17:49:37 +0200
Subject: [PATCH 2/7] update filtering of invalid entries

---
 matsim/scenariogen/data/formats/mid.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/matsim/scenariogen/data/formats/mid.py b/matsim/scenariogen/data/formats/mid.py
index 1683409..3b1177d 100644
--- a/matsim/scenariogen/data/formats/mid.py
+++ b/matsim/scenariogen/data/formats/mid.py
@@ -104,7 +104,7 @@ def convert(data: tuple, regio=None):
             get(t, "W_GEW", "gew_wege"), p_id, get(t, "H_ID"),
             n, int(t.ST_WOTAG), depature, int(t.wegmin), float(t.wegkm),
             Mid2017.main_mode(t), Mid2017.purpose(t), None,
-            float(t.wegkm) < 1000 and int(t.wegmin) < 1000 and depature is not None
+            float(t.wegkm) < 9994 and int(t.wegmin) < 9994 and depature is not None
         )
 
         if trip.purpose == Purpose.WAYBACK:

From 69e87e31e4faecb94ce54e8f182623b9f0e05584 Mon Sep 17 00:00:00 2001
From: rakow <rakow@vsp.tu-berlin.de>
Date: Fri, 7 Jun 2024 18:45:21 +0200
Subject: [PATCH 3/7] add few more attributes to activities

---
 matsim/scenariogen/data/__init__.py    |  5 +++-
 matsim/scenariogen/data/preparation.py | 34 +++++++++++++++++++-------
 2 files changed, 29 insertions(+), 10 deletions(-)

diff --git a/matsim/scenariogen/data/__init__.py b/matsim/scenariogen/data/__init__.py
index 0555f6e..75ba977 100644
--- a/matsim/scenariogen/data/__init__.py
+++ b/matsim/scenariogen/data/__init__.py
@@ -366,8 +366,9 @@ class Trip:
 
 @dataclass
 class Activity:
-    """ Activity information (including leg) """
+    """ Activity information (including some leg information as well) """
     a_id: str
+    a_weight: float
     p_id: str
     n: int
     type: Purpose
@@ -375,3 +376,5 @@ class Activity:
     leg_dist: float
     leg_duration: float
     leg_mode: TripMode
+    location: str = pd.NA
+    zone: str = pd.NA
diff --git a/matsim/scenariogen/data/preparation.py b/matsim/scenariogen/data/preparation.py
index 2029a75..c134b05 100644
--- a/matsim/scenariogen/data/preparation.py
+++ b/matsim/scenariogen/data/preparation.py
@@ -95,12 +95,28 @@ def augment_persons(pp, factor=1, permute_age=0.5):
     """
     pp = pp.reset_index()
 
-    duplicated = pp.loc[pp.index.repeat(np.maximum(1, np.rint(pp.p_weight * factor)))]
+    repeats = np.maximum(1, np.rint(pp.p_weight * factor)).to_numpy(int)
+    duplicated = pp.loc[pp.index.repeat(repeats)]
+
+    # Each sample has a unique sequence number
+    seq = np.zeros(len(duplicated), dtype=int)
+
+    i = 0
+    for r in repeats:
+        for s in range(r):
+            seq[i] = s
+            i += 1
+
+    duplicated["seq"] = seq
 
     if permute_age > 0:
         np.random.seed(0)
 
         age_permute = np.rint(np.random.normal(0, permute_age, len(duplicated))).astype(np.int32)
+
+        # Non duplicated person is not permuted
+        age_permute[seq == 0] = 0
+
         duplicated.age += age_permute
 
         # 0 age is minimum
@@ -264,11 +280,11 @@ def a_id(t_i):
                 return "%s_%d" % (p.Index, t_i)
 
             if len(trips) == 0:
-                acts.append(Activity(a_id(0), p.Index, 0, Purpose.HOME, 1440, 0, 0, TripMode.OTHER))
+                acts.append(Activity(a_id(0), p.p_weight, p.Index, 0, Purpose.HOME, 1440, 0, 0, TripMode.OTHER))
             else:
-                acts.append(
-                    Activity(a_id(0), p.Index, 0, trips.iloc[0].sd_group.source(), trips.iloc[0].departure, 0, 0,
-                             TripMode.OTHER))
+                trip_0 = trips.iloc[0]
+                acts.append(Activity(a_id(0), p.p_weight, p.Index, 0, trip_0.sd_group.source(), trip_0.departure, 0, 0,
+                             TripMode.OTHER, trip_0.from_location, trip_0.from_zone))
 
             for i in range(len(trips) - 1):
                 t0 = trips.iloc[i]
@@ -279,8 +295,8 @@ def a_id(t_i):
                 if duration < 0 or t0.gis_length < 0:
                     valid = False
 
-                acts.append(Activity(a_id(i + 1), p.Index, i + 1, t0.purpose,
-                                     duration, t0.gis_length, t0.duration, t0.main_mode))
+                acts.append(Activity(a_id(i + 1), p.p_weight, p.Index, i + 1, t0.purpose,duration, t0.gis_length,
+                                     t0.duration, t0.main_mode, t0.to_location, t0.to_zone))
 
             if len(trips) > 1:
                 i += 1
@@ -291,8 +307,8 @@ def a_id(t_i):
                     valid = False
 
                 # Duration is set to rest of day
-                acts.append(
-                    Activity(a_id(i + 1), p.Index, i + 1, tl.purpose, 1440, tl.gis_length, tl.duration, tl.main_mode))
+                acts.append(Activity(a_id(i + 1), p.p_weight, p.Index, i + 1, tl.purpose, 1440, tl.gis_length, tl.duration, tl.main_mode,
+                                     tl.to_location, tl.to_zone))
 
             if valid:
                 res.extend(acts)

From 4de5a32b4229efe68b0aa55ef9a59a5984edafd4 Mon Sep 17 00:00:00 2001
From: rakow <rakow@vsp.tu-berlin.de>
Date: Mon, 10 Jun 2024 17:04:49 +0200
Subject: [PATCH 4/7] calibration cli and script to run multiple simulations at
 once

---
 matsim/calibration/__init__.py        |   1 +
 matsim/calibration/__main__.py        |  39 ++++-----
 matsim/calibration/run_create_csv.py  |  35 ++++++++
 matsim/calibration/run_simulations.py | 121 ++++++++++++++++++++++++++
 setup.py                              |   1 +
 5 files changed, 174 insertions(+), 23 deletions(-)
 create mode 100644 matsim/calibration/run_create_csv.py
 create mode 100644 matsim/calibration/run_simulations.py

diff --git a/matsim/calibration/__init__.py b/matsim/calibration/__init__.py
index d999ae6..a0fcb19 100644
--- a/matsim/calibration/__init__.py
+++ b/matsim/calibration/__init__.py
@@ -213,6 +213,7 @@ def f(trial):
 
         if os.name != 'nt':
             cmd = cmd.split(" ")
+            cmd = [c for c in cmd if c != ""]
 
         p = subprocess.Popen(cmd,
                              stdout=sys.stdout if debug else subprocess.DEVNULL,
diff --git a/matsim/calibration/__main__.py b/matsim/calibration/__main__.py
index 576f745..08cc4b1 100644
--- a/matsim/calibration/__main__.py
+++ b/matsim/calibration/__main__.py
@@ -1,36 +1,29 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
-import optuna
-import traceback
+from . import run_create_csv
+from . import run_simulations
 
-from . import study_as_df
+def _add(subparsers, m):
+    """ Adds module to as subcommand"""
+    s1 = subparsers.add_parser(m.METADATA[0], help=m.METADATA[1])
+    m.setup(s1)
+    s1.set_defaults(func=m.main)
 
 if __name__ == "__main__":
     import argparse
 
-    parser = argparse.ArgumentParser(prog="matsim-calibration", description="Calibration CLI")
-    parser.add_argument('file', nargs=1, type=str, help="Path to input db")
-    parser.add_argument("--name", type=str, default="calib", help="Calibration name")
-    parser.add_argument("--output", default=None, help="Output path")
-    args = parser.parse_args()
-
-    study = optuna.load_study(
-        study_name=args.name,
-        storage="sqlite:///%s" % args.file[0],
-    )
+    parser = argparse.ArgumentParser(prog="matsim-calibration", description="MATSim calibration command line utility")
 
-    if not args.output:
-        args.output = args.file[0] + ".csv"
+    subparsers = parser.add_subparsers(title="Subcommands")
 
-    df = study_as_df(study)
-    df.to_csv(args.output, index=False)
+    _add(subparsers, run_create_csv)
+    _add(subparsers, run_simulations)
 
-    try:
-        from .plot import plot_study
-        plot_study(study)
+    args = parser.parse_args()
 
-    except ImportError:
-        print("Could not plot study.")
-        traceback.print_exc()
+    if not hasattr(args, 'func'):
+        parser.print_help()
+    else:
+        args.func(args)
 
diff --git a/matsim/calibration/run_create_csv.py b/matsim/calibration/run_create_csv.py
new file mode 100644
index 0000000..ce6d1d0
--- /dev/null
+++ b/matsim/calibration/run_create_csv.py
@@ -0,0 +1,35 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import argparse
+
+METADATA = "create-csv", "Create plots and csv from calibration study."
+
+def setup(parser: argparse.ArgumentParser):
+    parser.add_argument('file', nargs=1, type=str, help="Path to input db")
+    parser.add_argument("--name", type=str, default="calib", help="Calibration name")
+    parser.add_argument("--output", default=None, help="Output path")
+
+def main(args):
+
+    import optuna
+    from . import study_as_df
+
+    study = optuna.load_study(
+        study_name=args.name,
+        storage="sqlite:///%s" % args.file[0],
+    )
+
+    if not args.output:
+        args.output = args.file[0] + ".csv"
+
+    df = study_as_df(study)
+    df.to_csv(args.output, index=False)
+
+    try:
+        from .plot import plot_study
+        plot_study(study)
+
+    except ImportError:
+        print("Could not plot study.")
+        traceback.print_exc()
\ No newline at end of file
diff --git a/matsim/calibration/run_simulations.py b/matsim/calibration/run_simulations.py
new file mode 100644
index 0000000..30b88db
--- /dev/null
+++ b/matsim/calibration/run_simulations.py
@@ -0,0 +1,121 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import argparse
+import os
+import subprocess
+import sys
+from os import makedirs
+from time import sleep
+from typing import Union, Callable
+
+METADATA = "run-simulations", "Utility to run multiple simulations at once."
+
+
+def process_results(directory):
+    """Process results of multiple simulations"""
+
+    print("Processing results in %s" % directory)
+
+
+def run(jar: Union[str, os.PathLike],
+        config: Union[str, os.PathLike],
+        args: Union[str, Callable] = "",
+        jvm_args="",
+        runs: int = 10,
+        worker_id: int = 0,
+        workers: int = 1,
+        seed: int = 4711,
+        overwrite: bool = False,
+        custom_cli: Callable = None,
+        debug: bool = False):
+    """Run multiple simulations using different seeds at once. Simulations will be performed sequentially.
+    For parallel execution, multiple workers must be started. 
+
+    :param jar: path to executable jar file of the scenario
+    :param config: path to config file to run
+    :param args: arguments to pass to the simulation
+    :param jvm_args: arguments to pass to the JVM
+    :param runs: number of simulations to run
+    :param worker_id: id of this process
+    :param workers: total number of processes
+    :param seed: starting seed
+    :param overwrite: overwrite existing output directory
+    :param custom_cli: use custom command line interface
+    :param debug: if true, output will be printed to console
+    """
+    if not os.access(jar, os.R_OK):
+        raise ValueError("Can not access JAR File: %s" % jar)
+
+    if not os.access(config, os.R_OK):
+        raise ValueError("Can not access config File: %s" % config)
+
+    if not os.path.exists("eval-runs"):
+        makedirs("eval-runs")
+
+    for i in range(runs):
+        if i % workers != worker_id:
+            continue
+
+        run_dir = "eval-runs/%03d" % i
+
+        if os.path.exists(run_dir) and not overwrite:
+            print("Run %s already exists, skipping." % run_dir)
+            continue
+
+        run_args = args(completed) if callable(args) else args
+
+        # Same custom cli interface as calibration
+        if custom_cli:
+            cmd = custom_cli(jvm_args, jar, config, params_path, run_dir, trial.number, run_args)
+        else:
+            cmd = "java %s -jar %s run --config %s --output %s --runId %03d --config:global.randomSeed=%d %s" \
+                  % (jvm_args, jar, config, run_dir, i, seed + i, run_args)
+
+        # Extra whitespaces will break argument parsing
+        cmd = cmd.strip()
+
+        print("Running cmd %s" % cmd)
+
+        if os.name != 'nt':
+            cmd = cmd.split(" ")
+            cmd = [c for c in cmd if c != ""]
+
+        p = subprocess.Popen(cmd,
+                             stdout=sys.stdout if debug else subprocess.DEVNULL,
+                             stderr=sys.stderr if debug else subprocess.DEVNULL)
+
+        try:
+            while p.poll() is None:
+                sleep(1)
+
+            if p.returncode != 0:
+                print("The scenario could not be run properly and returned with an error code.", file=sys.stderr)
+                if not debug:
+                    print("Set debug=True and check the output for any errors.", file=sys.stderr)
+                    print("Alternatively run the cmd from the log above manually and check for errors.",
+                          file=sys.stderr)
+
+                raise Exception("Process returned with error code: %s." % p.returncode)
+        finally:
+            p.terminate()
+
+    process_results("eval_runs")
+
+
+def setup(parser: argparse.ArgumentParser):
+    parser.add_argument("--jar", type=str, required=True, help="Path to executable JAR file")
+    parser.add_argument("--config", type=str, required=True, help="Path to config file")
+    parser.add_argument("--args", type=str, default="", help="Arguments to pass to the simulation")
+    parser.add_argument("--jvm-args", type=str, default="", help="Arguments to pass to the JVM")
+    parser.add_argument("--runs", type=int, default=10, help="Number of simulations to run")
+    parser.add_argument("--worker-id", type=int, default=0, help="ID of this worker")
+    parser.add_argument("--workers", type=int, default=1, help="Total number of workers")
+    parser.add_argument("--seed", type=int, default=4711, help="Starting seed")
+    parser.add_argument("--overwrite", action="store_true", help="Overwrite existing output directories")
+    parser.add_argument("--debug", action="store_true", help="Print output to console")
+
+
+def main(args):
+    run(args.jar, args.config, args.args, args.jvm_args, args.runs, args.worker_id, args.workers, args.seed,
+        args.overwrite, debug=args.debug)
diff --git a/setup.py b/setup.py
index c575579..70609a4 100644
--- a/setup.py
+++ b/setup.py
@@ -42,6 +42,7 @@
         'console_scripts': [
             'matsim-tools=matsim.cli.main:main',
             'matsim-scenariogen=matsim.scenariogen:main',
+            'matsim-calibration=matsim.calibration:main',
             'matsim-viz=matsim.viz:main',
         ]
     },

From 51168f4c6486769c8c2e05ddecc734575e8b9050 Mon Sep 17 00:00:00 2001
From: rakow <rakow@vsp.tu-berlin.de>
Date: Mon, 10 Jun 2024 17:10:05 +0200
Subject: [PATCH 5/7] fix linting

---
 matsim/calibration/run_create_csv.py  | 1 +
 matsim/calibration/run_simulations.py | 9 ++++++---
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/matsim/calibration/run_create_csv.py b/matsim/calibration/run_create_csv.py
index ce6d1d0..c4a89e3 100644
--- a/matsim/calibration/run_create_csv.py
+++ b/matsim/calibration/run_create_csv.py
@@ -2,6 +2,7 @@
 # -*- coding: utf-8 -*-
 
 import argparse
+import traceback
 
 METADATA = "create-csv", "Create plots and csv from calibration study."
 
diff --git a/matsim/calibration/run_simulations.py b/matsim/calibration/run_simulations.py
index 30b88db..4b09093 100644
--- a/matsim/calibration/run_simulations.py
+++ b/matsim/calibration/run_simulations.py
@@ -50,6 +50,9 @@ def run(jar: Union[str, os.PathLike],
     if not os.access(config, os.R_OK):
         raise ValueError("Can not access config File: %s" % config)
 
+    if worker_id >= workers:
+        raise ValueError("Worker ID must be smaller than number of workers (starts at 0).")
+
     if not os.path.exists("eval-runs"):
         makedirs("eval-runs")
 
@@ -63,11 +66,11 @@ def run(jar: Union[str, os.PathLike],
             print("Run %s already exists, skipping." % run_dir)
             continue
 
-        run_args = args(completed) if callable(args) else args
+        run_args = args(i) if callable(args) else args
 
-        # Same custom cli interface as calibration
+        # Similar custom cli interface as calibration
         if custom_cli:
-            cmd = custom_cli(jvm_args, jar, config, params_path, run_dir, trial.number, run_args)
+            cmd = custom_cli(jvm_args, jar, config, run_dir, i, seed + i, run_args)
         else:
             cmd = "java %s -jar %s run --config %s --output %s --runId %03d --config:global.randomSeed=%d %s" \
                   % (jvm_args, jar, config, run_dir, i, seed + i, run_args)

From 916a0571b70e95b4f00e2cd60c7bcdd6d0742ff2 Mon Sep 17 00:00:00 2001
From: rakow <rakow@vsp.tu-berlin.de>
Date: Thu, 13 Jun 2024 20:28:29 +0200
Subject: [PATCH 6/7] implemented log-likelihood calculation

---
 matsim/calibration/run_simulations.py | 55 +++++++++++++++++++++++++--
 setup.py                              |  2 +-
 2 files changed, 53 insertions(+), 4 deletions(-)

diff --git a/matsim/calibration/run_simulations.py b/matsim/calibration/run_simulations.py
index 4b09093..5660f07 100644
--- a/matsim/calibration/run_simulations.py
+++ b/matsim/calibration/run_simulations.py
@@ -9,13 +9,62 @@
 from time import sleep
 from typing import Union, Callable
 
+import pandas as pd
+import numpy as np
+
 METADATA = "run-simulations", "Utility to run multiple simulations at once."
 
 
-def process_results(directory):
+def process_results(runs):
     """Process results of multiple simulations"""
+    from sklearn.metrics import log_loss
+    from sklearn.preprocessing import LabelEncoder
+
+    print("Processing results in %s" % runs)
+
+    dfs = None
+    for run in os.listdir(runs):
+        if not os.path.isdir(os.path.join(runs, run)):
+            continue
+        print("Processing run %s" % run)
+
+        df = pd.read_csv(os.path.join(runs, run, "analysis", "population", "mode_choices.csv"))
+        if dfs is None:
+            dfs = df
+        else:
+            dfs= dfs.merge(df, left_on=["person", "n", "true_mode"], right_on=["person", "n", "true_mode"], suffixes=("", "_%s" % run))
+
+    shares = dfs.groupby("true_mode").size() / len(dfs)
+    modes = shares.index
+
+    labels = LabelEncoder().fit(modes)
+    y_true = labels.transform(dfs["true_mode"])
+    y_null = np.tile(shares.to_numpy(), reps=(len(y_true), 1))
+    y_pred = np.zeros((len(y_true), len(modes)))
+    dists = dfs.euclidean_distance.to_numpy() / 1000
+
+    pred_cols = [c for c in dfs.columns if c.startswith("pred_mode")]
+    for p in dfs[pred_cols].itertuples():
+
+        for j, m in enumerate(modes):
+            c = 0
+            for col in pred_cols:
+               if getattr(p, col) == m:
+                     c += 1
+
+            y_pred[p.Index, j] = c / len(pred_cols)
+
+    result = [
+        ("Log likelihood", -log_loss(y_true, y_pred, sample_weight=dfs.weight, normalize=False), -log_loss(y_true, y_pred, sample_weight=dfs.weight * dists, normalize=False)),
+        ("Log likelihood (normalized)", -log_loss(y_true, y_pred, sample_weight=dfs.weight, normalize=True), -log_loss(y_true, y_pred, sample_weight=dfs.weight * dists, normalize=True)),
+        ("Log likelihood (null)", -log_loss(y_true, y_null, sample_weight=dfs.weight, normalize=False), -log_loss(y_true, y_null, sample_weight=dfs.weight * dists, normalize=False)),
+        ("Samples", len(dfs), sum(dists)),
+    ]
+
+    df = pd.DataFrame(result, columns=["Metric", "Value", "Distance weighted"]).set_index("Metric")
+    print(df)
 
-    print("Processing results in %s" % directory)
+    df.to_csv(os.path.join(runs, "results.csv"), index=True)
 
 
 def run(jar: Union[str, os.PathLike],
@@ -103,7 +152,7 @@ def run(jar: Union[str, os.PathLike],
         finally:
             p.terminate()
 
-    process_results("eval_runs")
+    process_results("eval-runs")
 
 
 def setup(parser: argparse.ArgumentParser):
diff --git a/setup.py b/setup.py
index 70609a4..e12a2fe 100644
--- a/setup.py
+++ b/setup.py
@@ -30,7 +30,7 @@
         "pandas >= 2.1.0",
     ],
     extras_require={
-        'calibration': ["optuna >= 3.3.0", "shapely >= 1.8.0", "geopandas >= 0.6.0"],
+        'calibration': ["optuna >= 3.3.0", "shapely >= 1.8.0", "geopandas >= 0.6.0", "scikit-learn"],
         # m2cgen has problems with newer xgb, see this issue
         # https://github.com/BayesWitnesses/m2cgen/issues/581
         'scenariogen': ["sumolib", "traci", "lxml", "optax", "requests", "tqdm", "scikit-learn", "xgboost==1.7.1", "lightgbm",

From b431704988869729c3d332c9116adddd1503471a Mon Sep 17 00:00:00 2001
From: rakow <rakow@vsp.tu-berlin.de>
Date: Sat, 15 Jun 2024 15:13:59 +0200
Subject: [PATCH 7/7] add accuracy score

---
 matsim/calibration/run_simulations.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/matsim/calibration/run_simulations.py b/matsim/calibration/run_simulations.py
index 5660f07..7a7607d 100644
--- a/matsim/calibration/run_simulations.py
+++ b/matsim/calibration/run_simulations.py
@@ -17,7 +17,7 @@
 
 def process_results(runs):
     """Process results of multiple simulations"""
-    from sklearn.metrics import log_loss
+    from sklearn.metrics import log_loss, accuracy_score
     from sklearn.preprocessing import LabelEncoder
 
     print("Processing results in %s" % runs)
@@ -26,7 +26,6 @@ def process_results(runs):
     for run in os.listdir(runs):
         if not os.path.isdir(os.path.join(runs, run)):
             continue
-        print("Processing run %s" % run)
 
         df = pd.read_csv(os.path.join(runs, run, "analysis", "population", "mode_choices.csv"))
         if dfs is None:
@@ -54,11 +53,16 @@ def process_results(runs):
 
             y_pred[p.Index, j] = c / len(pred_cols)
 
+    accs = [accuracy_score(dfs.true_mode, dfs[col], sample_weight=dfs.weight) for col in pred_cols]
+    accs_d = [accuracy_score(dfs.true_mode, dfs[col], sample_weight=dfs.weight * dists) for col in pred_cols]
+
     result = [
         ("Log likelihood", -log_loss(y_true, y_pred, sample_weight=dfs.weight, normalize=False), -log_loss(y_true, y_pred, sample_weight=dfs.weight * dists, normalize=False)),
         ("Log likelihood (normalized)", -log_loss(y_true, y_pred, sample_weight=dfs.weight, normalize=True), -log_loss(y_true, y_pred, sample_weight=dfs.weight * dists, normalize=True)),
+        ("Mean Accuracy", np.mean(accs), np.mean(accs_d)),
         ("Log likelihood (null)", -log_loss(y_true, y_null, sample_weight=dfs.weight, normalize=False), -log_loss(y_true, y_null, sample_weight=dfs.weight * dists, normalize=False)),
         ("Samples", len(dfs), sum(dists)),
+        ("Runs", len(pred_cols), len(pred_cols))
     ]
 
     df = pd.DataFrame(result, columns=["Metric", "Value", "Distance weighted"]).set_index("Metric")