Merge branch 'master' into dist-calibration

matsim-vsp · Jun 18, 2024 · 66eaf12 · 66eaf12
2 parents 209a70b + b834c3e
commit 66eaf12
Show file tree

Hide file tree

Showing 26 changed files with 1,848 additions and 121 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,8 @@
 
 All notable changes to this project will be documented in this file. See [standard-version](https://github.com/conventional-changelog/standard-version) for commit guidelines.
 
+### [0.0.19](https://github.com/matsim-vsp/matsim-python-tools/compare/v0.0.18...v0.0.19) (2024-06-18)
+
 ### [0.0.18](https://github.com/matsim-vsp/matsim-python-tools/compare/v0.0.16...v0.0.18) (2024-02-26)
 
 ### [0.0.17](https://github.com/matsim-vsp/matsim-python-tools/compare/v0.0.16...v0.0.17) (2024-02-26)

diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-0.0.18
+0.0.19
diff --git a/matsim/Network.py b/matsim/Network.py
@@ -125,6 +125,8 @@ def read_network(filename, skip_attributes=False):
                 # TODO: pandas will make the value column "object" since we're mixing types
                 if 'class' in elem.attrib:
                     if elem.attrib['class'] == 'java.lang.Long':
+                        atts['value'] = int(elem.text)
+                    if elem.attrib['class'] == 'java.lang.Double':
                         atts['value'] = float(elem.text)
                     if elem.attrib['class'] == 'java.lang.Integer':
                         atts['value'] = int(elem.text)

diff --git a/matsim/calibration/__init__.py b/matsim/calibration/__init__.py
@@ -233,6 +233,7 @@ def f(trial):
 
         if os.name != 'nt':
             cmd = cmd.split(" ")
+            cmd = [c for c in cmd if c != ""]
 
         p = subprocess.Popen(cmd,
                              stdout=sys.stdout if debug else subprocess.DEVNULL,

diff --git a/matsim/calibration/__main__.py b/matsim/calibration/__main__.py
@@ -1,36 +1,29 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
-import optuna
-import traceback
+from . import run_create_csv
+from . import run_simulations
 
-from . import study_as_df
+def _add(subparsers, m):
+    """Register module ``m`` as a subcommand.
+
+    The module has to provide ``METADATA`` (command name at index 0, help text at
+    index 1), a ``setup(parser)`` function that declares its arguments and a
+    ``main(args)`` function, which is stored as the ``func`` default of the sub-parser.
+    """
+    command = m.METADATA[0]
+    description = m.METADATA[1]
+
+    sub = subparsers.add_parser(command, help=description)
+    m.setup(sub)
+    sub.set_defaults(func=m.main)
 
 if __name__ == "__main__":
     import argparse
 
-    parser = argparse.ArgumentParser(prog="matsim-calibration", description="Calibration CLI")
-    parser.add_argument('file', nargs=1, type=str, help="Path to input db")
-    parser.add_argument("--name", type=str, default="calib", help="Calibration name")
-    parser.add_argument("--output", default=None, help="Output path")
-    args = parser.parse_args()
-
-    study = optuna.load_study(
-        study_name=args.name,
-        storage="sqlite:///%s" % args.file[0],
-    )
+    parser = argparse.ArgumentParser(prog="matsim-calibration", description="MATSim calibration command line utility")
 
-    if not args.output:
-        args.output = args.file[0] + ".csv"
+    subparsers = parser.add_subparsers(title="Subcommands")
 
-    df = study_as_df(study)
-    df.to_csv(args.output, index=False)
+    _add(subparsers, run_create_csv)
+    _add(subparsers, run_simulations)
 
-    try:
-        from .plot import plot_study
-        plot_study(study)
+    args = parser.parse_args()
 
-    except ImportError:
-        print("Could not plot study.")
-        traceback.print_exc()
+    if not hasattr(args, 'func'):
+        parser.print_help()
+    else:
+        args.func(args)
 
diff --git a/matsim/calibration/analysis.py b/matsim/calibration/analysis.py
@@ -71,8 +71,9 @@ def read_trips_and_persons(run, transform_persons=None, transform_trips=None) ->
     trips = glob.glob(run.rstrip("/") + "/*.output_trips.csv.gz")[0]
     persons = glob.glob(run.rstrip("/") + "/*.output_persons.csv.gz")[0]
 
-    df = pd.read_csv(trips, sep=";", dtype={"person": "str"}, low_memory=False)
-    dfp = pd.read_csv(persons, sep=";", dtype={"person": "str"}, low_memory=False)
+
+    df = pd.read_csv(trips, sep=None, dtype={"person": "str"}, low_memory=False)
+    dfp = pd.read_csv(persons, sep=None, dtype={"person": "str"}, low_memory=False)
 
     gdf = geopandas.GeoDataFrame(dfp,
             geometry=geopandas.points_from_xy(dfp.first_act_x, dfp.first_act_y)

diff --git a/matsim/calibration/run_create_csv.py b/matsim/calibration/run_create_csv.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import argparse
+import traceback
+
+# (subcommand name, help text) of this module
+METADATA = "create-csv", "Create plots and csv from calibration study."
+
+def setup(parser: argparse.ArgumentParser):
+    """Declare the command line arguments of this subcommand on ``parser``."""
+    # Positional database path; nargs=1 makes ``args.file`` a one-element list
+    parser.add_argument("file", nargs=1, type=str, help="Path to input db")
+
+    # Optional settings, registered in the order they are listed here
+    for flag, options in (
+        ("--name", dict(type=str, default="calib", help="Calibration name")),
+        ("--output", dict(default=None, help="Output path")),
+    ):
+        parser.add_argument(flag, **options)
+
+def main(args):
+    """Export a calibration study to csv and try to plot it.
+
+    Loads the optuna study ``args.name`` from the sqlite database ``args.file[0]``
+    and writes it as csv to ``args.output``. If no output path was given,
+    ``<db path>.csv`` is used and stored back into ``args.output``.
+    An ImportError raised while importing or running the plot code is printed
+    together with its traceback instead of being propagated.
+    """
+    # Function-local imports: only executed when this subcommand actually runs
+    import optuna
+    from . import study_as_df
+
+    db = args.file[0]
+    study = optuna.load_study(study_name=args.name, storage="sqlite:///%s" % db)
+
+    if not args.output:
+        args.output = db + ".csv"
+
+    study_as_df(study).to_csv(args.output, index=False)
+
+    try:
+        from .plot import plot_study
+        plot_study(study)
+    except ImportError:
+        print("Could not plot study.")
+        traceback.print_exc()
diff --git a/matsim/calibration/run_simulations.py b/matsim/calibration/run_simulations.py
@@ -0,0 +1,196 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import argparse
+import os
+import subprocess
+import sys
+from os import makedirs
+from time import sleep
+from typing import Union, Callable
+
+import numpy as np
+import pandas as pd
+
+# (subcommand name, help text) of this module
+METADATA = "run-simulations", "Utility to run multiple simulations at once."
+
+
+def likelihood_ratio(ll, ll_null):
+    """Return the likelihood ratio statistic ``2 * (ll - ll_null)``.
+
+    :param ll: log likelihood of the model
+    :param ll_null: log likelihood of the null model
+    """
+    difference = ll - ll_null
+    return 2 * difference
+
+
+def likelihood_ratio_test(ll, ll_null, dof=1):
+    """Return the p-value of a likelihood ratio test.
+
+    The likelihood ratio statistic of the two log likelihoods is evaluated with the
+    survival function of a chi-squared distribution.
+
+    :param ll: log likelihood of the model
+    :param ll_null: log likelihood of the null model
+    :param dof: degrees of freedom of the chi-squared distribution
+    """
+    from scipy.stats.distributions import chi2
+
+    statistic = likelihood_ratio(ll, ll_null)
+    return chi2.sf(statistic, dof)
+
+
+def process_results(runs):
+    """Process results of multiple simulations.
+
+    Every sub-directory of ``runs`` must contain ``analysis/population/mode_choices.csv``.
+    The tables are joined on ``person``, ``n`` and ``true_mode``, so each run contributes
+    one ``pred_mode*`` column. The share of runs predicting a mode is used as its
+    predicted probability; the observed mode shares serve as null model.
+    Each metric is computed twice: weighted by ``weight`` and weighted by
+    ``weight * euclidean_distance / 1000``. The resulting table is printed and written
+    to ``results.csv`` inside ``runs``.
+
+    :param runs: directory containing one sub-directory per simulation run
+    :raises ValueError: if ``runs`` does not contain any run directory
+    """
+    print("Processing results in %s" % runs)
+
+    # Sorted, so the un-suffixed base run and the column suffixes do not depend on the
+    # arbitrary order returned by os.listdir
+    run_dirs = sorted(d for d in os.listdir(runs) if os.path.isdir(os.path.join(runs, d)))
+    if not run_dirs:
+        raise ValueError("No simulation runs found in %s" % runs)
+
+    # Imported after the cheap sanity check, only needed for the evaluation itself
+    from sklearn.metrics import log_loss, accuracy_score
+    from sklearn.preprocessing import LabelEncoder
+
+    keys = ["person", "n", "true_mode"]
+    dfs = None
+    for name in run_dirs:
+        df = pd.read_csv(os.path.join(runs, name, "analysis", "population", "mode_choices.csv"))
+        if dfs is None:
+            dfs = df
+        else:
+            # Columns of the first run keep their name, later runs get the run name as suffix
+            dfs = dfs.merge(df, left_on=keys, right_on=keys, suffixes=("", "_%s" % name))
+
+    shares = dfs.groupby("true_mode").size() / len(dfs)
+    modes = shares.index
+
+    labels = LabelEncoder().fit(modes)
+    y_true = labels.transform(dfs["true_mode"])
+    # Null model: every sample is predicted with the observed mode shares
+    y_null = np.tile(shares.to_numpy(), reps=(len(y_true), 1))
+    dists = dfs.euclidean_distance.to_numpy() / 1000
+
+    pred_cols = [c for c in dfs.columns if c.startswith("pred_mode")]
+
+    # Predicted probability of a mode = fraction of runs that predicted it.
+    # Computed on the raw array, so it neither depends on the index of the frame
+    # nor on the column names being valid python identifiers.
+    preds = dfs[pred_cols].to_numpy()
+    y_pred = np.column_stack([(preds == m).mean(axis=1) for m in modes])
+
+    weight = dfs.weight
+    weight_dist = dfs.weight * dists
+
+    def log_likelihood(y_prob, sample_weight, normalize=False):
+        return -log_loss(y_true, y_prob, sample_weight=sample_weight, normalize=normalize)
+
+    ll, ll_d = log_likelihood(y_pred, weight), log_likelihood(y_pred, weight_dist)
+    ll_null, ll_null_d = log_likelihood(y_null, weight), log_likelihood(y_null, weight_dist)
+
+    accs = [accuracy_score(dfs.true_mode, dfs[col], sample_weight=weight) for col in pred_cols]
+    accs_d = [accuracy_score(dfs.true_mode, dfs[col], sample_weight=weight_dist) for col in pred_cols]
+
+    result = [
+        ("Log likelihood", ll, ll_d),
+        ("Log likelihood (normalized)", log_likelihood(y_pred, weight, normalize=True),
+         log_likelihood(y_pred, weight_dist, normalize=True)),
+        ("Log likelihood (null)", ll_null, ll_null_d),
+        ("Mean Accuracy", np.mean(accs), np.mean(accs_d)),
+        ("McFadden R2", 1 - (ll / ll_null), 1 - (ll_d / ll_null_d)),
+        ("LL ratio", likelihood_ratio(ll, ll_null), likelihood_ratio(ll_d, ll_null_d)),
+        ("LL ratio test (dof=1)", likelihood_ratio_test(ll, ll_null), likelihood_ratio_test(ll_d, ll_null_d)),
+        ("Samples", len(dfs), dists.sum()),
+        ("Runs", len(pred_cols), len(pred_cols)),
+    ]
+
+    df = pd.DataFrame(result, columns=["Metric", "Value", "Distance weighted"]).set_index("Metric")
+    print(df)
+
+    df.to_csv(os.path.join(runs, "results.csv"), index=True)
+
+
+def run(jar: Union[str, os.PathLike],
+        config: Union[str, os.PathLike],
+        args: Union[str, Callable] = "",
+        jvm_args="",
+        runs: int = 10,
+        worker_id: int = 0,
+        workers: int = 1,
+        seed: int = 4711,
+        overwrite: bool = False,
+        custom_cli: Union[Callable, None] = None,
+        debug: bool = False):
+    """Run multiple simulations using different seeds at once. Simulations will be performed sequentially.
+    For parallel execution, multiple workers must be started.
+
+    Run ``i`` is written to ``eval-runs/<i>`` and uses the seed ``seed + i``. A worker only
+    executes the runs with ``i % workers == worker_id``. After all runs of this worker
+    have finished, the results found in ``eval-runs`` are processed.
+
+    :param jar: path to executable jar file of the scenario
+    :param config: path to config file to run
+    :param args: arguments to pass to the simulation, or a function returning them for a run index
+    :param jvm_args: arguments to pass to the JVM
+    :param runs: number of simulations to run
+    :param worker_id: id of this process
+    :param workers: total number of processes
+    :param seed: starting seed
+    :param overwrite: overwrite existing output directory
+    :param custom_cli: use custom command line interface
+    :param debug: if true, output will be printed to console
+    :raises ValueError: if jar or config are not readable, or the worker id is out of range
+    :raises RuntimeError: if a simulation exits with a non-zero return code
+    """
+    if not os.access(jar, os.R_OK):
+        raise ValueError("Can not access JAR File: %s" % jar)
+
+    if not os.access(config, os.R_OK):
+        raise ValueError("Can not access config File: %s" % config)
+
+    # A negative id would match no run at all and silently do nothing
+    if worker_id < 0:
+        raise ValueError("Worker ID must not be negative (starts at 0).")
+
+    if worker_id >= workers:
+        raise ValueError("Worker ID must be smaller than number of workers (starts at 0).")
+
+    # exist_ok avoids a crash when multiple workers start at the same time
+    makedirs("eval-runs", exist_ok=True)
+
+    # Every worker takes each workers-th run, starting at its own id
+    for i in range(worker_id, runs, workers):
+        run_dir = "eval-runs/%03d" % i
+
+        if os.path.exists(run_dir) and not overwrite:
+            print("Run %s already exists, skipping." % run_dir)
+            continue
+
+        run_args = args(i) if callable(args) else args
+
+        # Similar custom cli interface as calibration
+        if custom_cli:
+            cmd = custom_cli(jvm_args, jar, config, run_dir, i, seed + i, run_args)
+        else:
+            cmd = "java %s -jar %s run --config %s --output %s --runId %03d --config:global.randomSeed=%d %s" \
+                  % (jvm_args, jar, config, run_dir, i, seed + i, run_args)
+
+        # Extra whitespaces will break argument parsing
+        cmd = cmd.strip()
+
+        print("Running cmd %s" % cmd)
+
+        if os.name != 'nt':
+            # Empty entries result from consecutive spaces, e.g. when jvm_args is empty
+            cmd = [c for c in cmd.split(" ") if c != ""]
+
+        p = subprocess.Popen(cmd,
+                             stdout=sys.stdout if debug else subprocess.DEVNULL,
+                             stderr=sys.stderr if debug else subprocess.DEVNULL)
+
+        try:
+            # Blocks until the simulation has finished
+            returncode = p.wait()
+
+            if returncode != 0:
+                print("The scenario could not be run properly and returned with an error code.", file=sys.stderr)
+                if not debug:
+                    print("Set debug=True and check the output for any errors.", file=sys.stderr)
+                    print("Alternatively run the cmd from the log above manually and check for errors.",
+                          file=sys.stderr)
+
+                raise RuntimeError("Process returned with error code: %s." % returncode)
+        finally:
+            # Stops the simulation if waiting was interrupted, no effect if it already finished
+            p.terminate()
+
+    process_results("eval-runs")
+
+
+def setup(parser: argparse.ArgumentParser):
+    """Declare the command line arguments of this subcommand on ``parser``."""
+    # One entry per option, registered in exactly this order
+    options = (
+        ("--jar", dict(type=str, required=True, help="Path to executable JAR file")),
+        ("--config", dict(type=str, required=True, help="Path to config file")),
+        ("--args", dict(type=str, default="", help="Arguments to pass to the simulation")),
+        ("--jvm-args", dict(type=str, default="", help="Arguments to pass to the JVM")),
+        ("--runs", dict(type=int, default=10, help="Number of simulations to run")),
+        ("--worker-id", dict(type=int, default=0, help="ID of this worker")),
+        ("--workers", dict(type=int, default=1, help="Total number of workers")),
+        ("--seed", dict(type=int, default=4711, help="Starting seed")),
+        ("--overwrite", dict(action="store_true", help="Overwrite existing output directories")),
+        ("--debug", dict(action="store_true", help="Print output to console")),
+    )
+
+    for flag, settings in options:
+        parser.add_argument(flag, **settings)
+
+
+def main(args):
+    """Entry point of the subcommand: forward the parsed arguments to ``run``."""
+    run(
+        jar=args.jar,
+        config=args.config,
+        args=args.args,
+        jvm_args=args.jvm_args,
+        runs=args.runs,
+        worker_id=args.worker_id,
+        workers=args.workers,
+        seed=args.seed,
+        overwrite=args.overwrite,
+        debug=args.debug,
+    )