Skip to content

Commit

Permalink
Merge branch 'master' into dist-calibration
Browse files Browse the repository at this point in the history
  • Loading branch information
rakow authored Jun 18, 2024
2 parents 209a70b + b834c3e commit 66eaf12
Show file tree
Hide file tree
Showing 26 changed files with 1,848 additions and 121 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

All notable changes to this project will be documented in this file. See [standard-version](https://github.com/conventional-changelog/standard-version) for commit guidelines.

### [0.0.19](https://github.com/matsim-vsp/matsim-python-tools/compare/v0.0.18...v0.0.19) (2024-06-18)

### [0.0.18](https://github.com/matsim-vsp/matsim-python-tools/compare/v0.0.16...v0.0.18) (2024-02-26)

### [0.0.17](https://github.com/matsim-vsp/matsim-python-tools/compare/v0.0.16...v0.0.17) (2024-02-26)
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.0.18
0.0.19
2 changes: 2 additions & 0 deletions matsim/Network.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,8 @@ def read_network(filename, skip_attributes=False):
# TODO: pandas will make the value column "object" since we're mixing types
if 'class' in elem.attrib:
if elem.attrib['class'] == 'java.lang.Long':
atts['value'] = int(elem.text)
if elem.attrib['class'] == 'java.lang.Double':
atts['value'] = float(elem.text)
if elem.attrib['class'] == 'java.lang.Integer':
atts['value'] = int(elem.text)
Expand Down
1 change: 1 addition & 0 deletions matsim/calibration/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,7 @@ def f(trial):

if os.name != 'nt':
cmd = cmd.split(" ")
cmd = [c for c in cmd if c != ""]

p = subprocess.Popen(cmd,
stdout=sys.stdout if debug else subprocess.DEVNULL,
Expand Down
39 changes: 16 additions & 23 deletions matsim/calibration/__main__.py
Original file line number Diff line number Diff line change
@@ -1,36 +1,29 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import optuna
import traceback
from . import run_create_csv
from . import run_simulations

from . import study_as_df
def _add(subparsers, m):
""" Adds module to as subcommand"""
s1 = subparsers.add_parser(m.METADATA[0], help=m.METADATA[1])
m.setup(s1)
s1.set_defaults(func=m.main)

if __name__ == "__main__":
import argparse

parser = argparse.ArgumentParser(prog="matsim-calibration", description="Calibration CLI")
parser.add_argument('file', nargs=1, type=str, help="Path to input db")
parser.add_argument("--name", type=str, default="calib", help="Calibration name")
parser.add_argument("--output", default=None, help="Output path")
args = parser.parse_args()

study = optuna.load_study(
study_name=args.name,
storage="sqlite:///%s" % args.file[0],
)
parser = argparse.ArgumentParser(prog="matsim-calibration", description="MATSim calibration command line utility")

if not args.output:
args.output = args.file[0] + ".csv"
subparsers = parser.add_subparsers(title="Subcommands")

df = study_as_df(study)
df.to_csv(args.output, index=False)
_add(subparsers, run_create_csv)
_add(subparsers, run_simulations)

try:
from .plot import plot_study
plot_study(study)
args = parser.parse_args()

except ImportError:
print("Could not plot study.")
traceback.print_exc()
if not hasattr(args, 'func'):
parser.print_help()
else:
args.func(args)

5 changes: 3 additions & 2 deletions matsim/calibration/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,9 @@ def read_trips_and_persons(run, transform_persons=None, transform_trips=None) ->
trips = glob.glob(run.rstrip("/") + "/*.output_trips.csv.gz")[0]
persons = glob.glob(run.rstrip("/") + "/*.output_persons.csv.gz")[0]

df = pd.read_csv(trips, sep=";", dtype={"person": "str"}, low_memory=False)
dfp = pd.read_csv(persons, sep=";", dtype={"person": "str"}, low_memory=False)

df = pd.read_csv(trips, sep=None, dtype={"person": "str"}, low_memory=False)
dfp = pd.read_csv(persons, sep=None, dtype={"person": "str"}, low_memory=False)

gdf = geopandas.GeoDataFrame(dfp,
geometry=geopandas.points_from_xy(dfp.first_act_x, dfp.first_act_y)
Expand Down
36 changes: 36 additions & 0 deletions matsim/calibration/run_create_csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import argparse
import traceback

METADATA = "create-csv", "Create plots and csv from calibration study."

def setup(parser: argparse.ArgumentParser):
parser.add_argument('file', nargs=1, type=str, help="Path to input db")
parser.add_argument("--name", type=str, default="calib", help="Calibration name")
parser.add_argument("--output", default=None, help="Output path")

def main(args):
    """Export a calibration study to csv and try to plot it.

    The study ``args.name`` is loaded from the sqlite database ``args.file[0]``.
    If no output path was given, ``args.output`` is set to the database path
    with ``.csv`` appended. Plotting is optional: an ``ImportError`` raised
    while importing or running the plot code is reported, but not propagated.

    :param args: parsed command line arguments (``file``, ``name``, ``output``)
    """
    # Imported lazily, so the command line interface can be built without optuna
    import optuna
    from . import study_as_df

    db_path = args.file[0]
    study = optuna.load_study(study_name=args.name, storage="sqlite:///%s" % db_path)

    if not args.output:
        args.output = db_path + ".csv"

    study_as_df(study).to_csv(args.output, index=False)

    try:
        from .plot import plot_study
        plot_study(study)
    except ImportError:
        print("Could not plot study.")
        traceback.print_exc()
196 changes: 196 additions & 0 deletions matsim/calibration/run_simulations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import argparse
import os
import subprocess
import sys
from os import makedirs
from time import sleep
from typing import Union, Callable

import numpy as np
import pandas as pd

METADATA = "run-simulations", "Utility to run multiple simulations at once."


def likelihood_ratio(ll, ll_null):
    """Return the likelihood ratio statistic ``2 * (ll - ll_null)``.

    :param ll: log likelihood of the model
    :param ll_null: log likelihood of the null model
    """
    improvement = ll - ll_null
    return 2 * improvement


def likelihood_ratio_test(ll, ll_null, dof=1):
    """Return the survival function value of the chi-squared distribution
    at the likelihood ratio statistic.

    :param ll: log likelihood of the model
    :param ll_null: log likelihood of the null model
    :param dof: degrees of freedom of the chi-squared distribution
    """
    # scipy is only required when the test is actually computed
    from scipy.stats.distributions import chi2

    statistic = likelihood_ratio(ll, ll_null)
    return chi2.sf(statistic, dof)


def _read_mode_choices(runs):
    """Merge the ``mode_choices.csv`` of every run directory below ``runs``.

    Returns ``None`` if ``runs`` contains no sub-directory.
    """
    merged = None

    # Sorted, so the un-suffixed columns always originate from the same run
    # and do not depend on the listing order of the file system.
    for run in sorted(os.listdir(runs)):
        run_dir = os.path.join(runs, run)
        if not os.path.isdir(run_dir):
            continue

        df = pd.read_csv(os.path.join(run_dir, "analysis", "population", "mode_choices.csv"))
        if merged is None:
            merged = df
        else:
            # Columns that exist in both frames (e.g. the predictions) get the run name as suffix
            merged = merged.merge(df, on=["person", "n", "true_mode"], suffixes=("", "_%s" % run))

    return merged


def _prediction_shares(predictions, modes):
    """Return per row the fraction of columns in ``predictions`` that are equal to each mode."""
    shares = np.zeros((len(predictions), len(modes)))
    n_cols = predictions.shape[1]

    for j, mode in enumerate(modes):
        shares[:, j] = predictions.eq(mode).sum(axis=1).to_numpy() / n_cols

    return shares


def process_results(runs):
    """Process results of multiple simulations.

    Reads ``analysis/population/mode_choices.csv`` from every sub-directory of
    ``runs`` and joins them on ``person``, ``n`` and ``true_mode``. The share of
    runs that predicted each mode is used as predicted probability. Besides the
    join keys, the columns ``weight``, ``euclidean_distance`` and at least one
    column starting with ``pred_mode`` are read.

    All metrics are computed twice: weighted by ``weight`` and weighted by
    ``weight`` times distance. The table is printed and written to
    ``results.csv`` inside ``runs``.

    :param runs: directory containing one sub-directory per simulation run
    :raises ValueError: if no run directory or no prediction column is found
    """
    print("Processing results in %s" % runs)

    dfs = _read_mode_choices(runs)
    if dfs is None:
        raise ValueError("No run directories found in %s" % runs)

    pred_cols = [c for c in dfs.columns if c.startswith("pred_mode")]
    if not pred_cols:
        raise ValueError("No prediction columns (pred_mode*) found in results of %s" % runs)

    # scikit-learn is only needed for the metrics below
    from sklearn.metrics import log_loss, accuracy_score
    from sklearn.preprocessing import LabelEncoder

    # Observed mode shares are the prediction of the null model
    shares = dfs.groupby("true_mode").size() / len(dfs)
    modes = shares.index

    labels = LabelEncoder().fit(modes)
    y_true = labels.transform(dfs["true_mode"])
    y_null = np.tile(shares.to_numpy(), reps=(len(y_true), 1))
    y_pred = _prediction_shares(dfs[pred_cols], modes)

    # presumably converts meters to kilometers - TODO confirm unit of euclidean_distance
    dists = dfs.euclidean_distance.to_numpy() / 1000

    weight = dfs.weight
    weight_dist = dfs.weight * dists

    accs = [accuracy_score(dfs.true_mode, dfs[col], sample_weight=weight) for col in pred_cols]
    accs_d = [accuracy_score(dfs.true_mode, dfs[col], sample_weight=weight_dist) for col in pred_cols]

    ll = -log_loss(y_true, y_pred, sample_weight=weight, normalize=False)
    ll_d = -log_loss(y_true, y_pred, sample_weight=weight_dist, normalize=False)
    ll_norm = -log_loss(y_true, y_pred, sample_weight=weight, normalize=True)
    ll_norm_d = -log_loss(y_true, y_pred, sample_weight=weight_dist, normalize=True)
    ll_null = -log_loss(y_true, y_null, sample_weight=weight, normalize=False)
    ll_null_d = -log_loss(y_true, y_null, sample_weight=weight_dist, normalize=False)

    result = [
        ("Log likelihood", ll, ll_d),
        ("Log likelihood (normalized)", ll_norm, ll_norm_d),
        ("Log likelihood (null)", ll_null, ll_null_d),
        ("Mean Accuracy", np.mean(accs), np.mean(accs_d)),
        ("McFadden R2", 1 - (ll / ll_null), 1 - (ll_d / ll_null_d)),
        ("LL ratio", likelihood_ratio(ll, ll_null), likelihood_ratio(ll_d, ll_null_d)),
        ("LL ratio test (dof=1)", likelihood_ratio_test(ll, ll_null), likelihood_ratio_test(ll_d, ll_null_d)),
        ("Samples", len(dfs), sum(dists)),
        ("Runs", len(pred_cols), len(pred_cols))
    ]

    df = pd.DataFrame(result, columns=["Metric", "Value", "Distance weighted"]).set_index("Metric")
    print(df)

    df.to_csv(os.path.join(runs, "results.csv"), index=True)


def run(jar: Union[str, os.PathLike],
        config: Union[str, os.PathLike],
        args: Union[str, Callable] = "",
        jvm_args="",
        runs: int = 10,
        worker_id: int = 0,
        workers: int = 1,
        seed: int = 4711,
        overwrite: bool = False,
        custom_cli: Callable = None,
        debug: bool = False):
    """Run multiple simulations using different seeds at once. Simulations will be performed sequentially.
    For parallel execution, multiple workers must be started.

    Run ``i`` is written to ``eval-runs/<i>`` (three digits, relative to the working directory) and uses
    ``seed + i`` as random seed. A worker only executes the runs for which ``i % workers == worker_id``.
    After its last run the results in ``eval-runs`` are processed.

    :param jar: path to executable jar file of the scenario
    :param config: path to config file to run
    :param args: arguments to pass to the simulation, either a string or a function of the run index
    :param jvm_args: arguments to pass to the JVM
    :param runs: number of simulations to run
    :param worker_id: id of this process
    :param workers: total number of processes
    :param seed: starting seed
    :param overwrite: overwrite existing output directory
    :param custom_cli: use custom command line interface, called as
        ``custom_cli(jvm_args, jar, config, run_dir, i, seed + i, run_args)`` and must return the command as string
    :param debug: if true, output will be printed to console
    :raises ValueError: if jar or config are not readable or the worker id is out of range
    :raises Exception: if a simulation returns with a non-zero exit code
    """
    if not os.access(jar, os.R_OK):
        raise ValueError("Can not access JAR File: %s" % jar)

    if not os.access(config, os.R_OK):
        raise ValueError("Can not access config File: %s" % config)

    if worker_id >= workers:
        raise ValueError("Worker ID must be smaller than number of workers (starts at 0).")

    # Several workers may start at the same time, a separate existence check
    # followed by the creation would race between them.
    makedirs("eval-runs", exist_ok=True)

    for i in range(runs):
        # Runs are distributed round-robin over the workers
        if i % workers != worker_id:
            continue

        run_dir = "eval-runs/%03d" % i

        if os.path.exists(run_dir) and not overwrite:
            print("Run %s already exists, skipping." % run_dir)
            continue

        run_args = args(i) if callable(args) else args

        # Similar custom cli interface as calibration
        if custom_cli:
            cmd = custom_cli(jvm_args, jar, config, run_dir, i, seed + i, run_args)
        else:
            cmd = "java %s -jar %s run --config %s --output %s --runId %03d --config:global.randomSeed=%d %s" \
                  % (jvm_args, jar, config, run_dir, i, seed + i, run_args)

        # Extra whitespaces will break argument parsing
        cmd = cmd.strip()

        print("Running cmd %s" % cmd)

        # On non-Windows systems the command is passed as argument list;
        # empty entries originate from consecutive spaces and are dropped.
        if os.name != 'nt':
            cmd = cmd.split(" ")
            cmd = [c for c in cmd if c != ""]

        p = subprocess.Popen(cmd,
                             stdout=sys.stdout if debug else subprocess.DEVNULL,
                             stderr=sys.stderr if debug else subprocess.DEVNULL)

        try:
            while p.poll() is None:
                sleep(1)

            if p.returncode != 0:
                print("The scenario could not be run properly and returned with an error code.", file=sys.stderr)
                if not debug:
                    print("Set debug=True and check the output for any errors.", file=sys.stderr)
                    print("Alternatively run the cmd from the log above manually and check for errors.",
                          file=sys.stderr)

                raise Exception("Process returned with error code: %s." % p.returncode)
        finally:
            # Makes sure the simulation does not keep running if this process is interrupted
            p.terminate()

    # NOTE(review): every worker processes the results once its own share is finished,
    # runs of other workers may not be complete at that point - confirm this is intended.
    process_results("eval-runs")


def setup(parser: argparse.ArgumentParser):
parser.add_argument("--jar", type=str, required=True, help="Path to executable JAR file")
parser.add_argument("--config", type=str, required=True, help="Path to config file")
parser.add_argument("--args", type=str, default="", help="Arguments to pass to the simulation")
parser.add_argument("--jvm-args", type=str, default="", help="Arguments to pass to the JVM")
parser.add_argument("--runs", type=int, default=10, help="Number of simulations to run")
parser.add_argument("--worker-id", type=int, default=0, help="ID of this worker")
parser.add_argument("--workers", type=int, default=1, help="Total number of workers")
parser.add_argument("--seed", type=int, default=4711, help="Starting seed")
parser.add_argument("--overwrite", action="store_true", help="Overwrite existing output directories")
parser.add_argument("--debug", action="store_true", help="Print output to console")


def main(args):
    """Entry point of the ``run-simulations`` subcommand.

    Forwards the parsed command line arguments to :func:`run`.

    :param args: parsed command line arguments
    """
    run(
        jar=args.jar,
        config=args.config,
        args=args.args,
        jvm_args=args.jvm_args,
        runs=args.runs,
        worker_id=args.worker_id,
        workers=args.workers,
        seed=args.seed,
        overwrite=args.overwrite,
        debug=args.debug,
    )
Loading

0 comments on commit 66eaf12

Please sign in to comment.