Skip to content

Commit

Permalink
extended create ref data script
Browse files Browse the repository at this point in the history
  • Loading branch information
rakow committed Dec 26, 2023
1 parent f3b1bb9 commit 1b35bb7
Show file tree
Hide file tree
Showing 2 changed files with 79 additions and 28 deletions.
12 changes: 12 additions & 0 deletions matsim/scenariogen/data/preparation.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,18 @@ def prepare_persons(hh, pp, tt, augment=5, max_hh_size=5, core_weekday=False, re

return df

def bins_to_labels(bins):
    """ Build one human-readable label per bin from its (lower, upper) bounds.

    An open-ended last bin (upper bound of np.inf) is rendered as "lower+".
    """
    labels = ["%.0f - %.0f" % bounds for bounds in zip(bins[:-1], bins[1:])]

    # Open-ended upper bound: show "lower+" instead of "lower - inf"
    if bins[-1] == np.inf:
        labels[-1] = "%.0f+" % bins[-2]

    return labels

def cut(x, bins):
    """ Cut x into bins, labeled via bins_to_labels; bins are left-closed (right=False). """
    return pd.cut(x, bins, labels=bins_to_labels(bins), right=False)

def augment_persons(pp, factor=1, permute_age=0.5):
""" Augment persons using p weight
Expand Down
95 changes: 67 additions & 28 deletions matsim/scenariogen/data/run_create_ref_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,19 @@
# -*- coding: utf-8 -*-

import argparse
from dataclasses import dataclass
from enum import Enum, auto
from typing import List, Union, Tuple

import numpy as np
import pandas as pd

from enum import Enum, auto

from . import *
from .preparation import _fill
from .preparation import cut

METADATA = "data-create-ref", "Extract and create reference data from surveys."


class InvalidHandling(Enum):
""" How to handle invalid trips. """
# Invalid trips are ignored
Expand All @@ -22,8 +24,20 @@ class InvalidHandling(Enum):
# Drop whole person if any trip is invalid
REMOVE_PERSONS = auto()


@dataclass
class AggregationResult:
    """ Return value of create function. """

    # Person-level reference data after filtering/transformation.
    persons: pd.DataFrame
    # Trip-level reference data (with the derived dist_group column).
    trips: pd.DataFrame
    # Overall mode share aggregated over all trips.
    share: pd.DataFrame

    # Per-attribute-group shares; None when no ref_groups were requested.
    groups: Union[pd.DataFrame, None] = None


def weighted(x):
    """ Aggregate a group of trips to its total weight.

    :param x: group of trips with a t_weight column
    :return: Series with the summed trip weight under key "n"
    """
    # The first, dead assignment (also computing mean_dist from gis_length) was
    # diff residue and is removed; only the weighted trip count is needed here.
    data = dict(n=x.t_weight.sum())
    return pd.Series(data=data)


Expand Down Expand Up @@ -83,8 +97,18 @@ def default_person_filter(df):

def create(survey_dirs, transform_persons, transform_trips,
invalid_trip_handling: InvalidHandling = InvalidHandling.REMOVE_TRIPS,
impute_modes=None):
""" Create reference data from survey data. """
dist_groups=[0, 1000, 2000, 5000, 10000, 20000, np.inf],
ref_groups: List[Union[str, Tuple[str]]] = None,
output_prefix="") -> AggregationResult:
""" Create reference data from survey data.
:param survey_dirs: Directories with survey data
:param transform_persons: Function to transform person data frame
:param transform_trips: Function to transform trip data frame
:param invalid_trip_handling: How to handle invalid trips
    :param ref_groups: Create reference data for these attribute groups
    :param output_prefix: prefix for the output files
:return:
"""

all_hh, all_persons, all_trips = read_all(survey_dirs)

Expand All @@ -93,9 +117,6 @@ def create(survey_dirs, transform_persons, transform_trips,

persons = transform_persons(df) if transform_persons is not None else df

# TODO: configurable attributes
persons["age_group"] = pd.cut(persons.age, [0, 18, 66, np.inf], labels=["0 - 17", "18 - 65", "65+"], right=False)

if invalid_trip_handling == InvalidHandling.REMOVE_PERSONS:
# Filter persons, if they have at least one invalid trip
invalid = set(all_trips[~all_trips.valid].p_id)
Expand All @@ -110,16 +131,8 @@ def create(survey_dirs, transform_persons, transform_trips,
# Transform existing trips
trips = transform_trips(trips) if transform_trips is not None else trips

# Fill certain modes with distribution from existing
if impute_modes is not None:
for m in impute_modes:
_fill(trips, "main_mode", m)

# TODO: configurable dist binds
labels = ["0 - 1000", "1000 - 2000", "2000 - 5000", "5000 - 10000", "10000 - 20000", "20000+"]
bins = [0, 1000, 2000, 5000, 10000, 20000, np.inf]

trips["dist_group"] = pd.cut(trips.gis_length * 1000, bins, labels=labels, right=False)
# Set dist groups
trips["dist_group"] = cut(trips.gis_length * 1000, dist_groups)

aggr = trips.groupby(["dist_group", "main_mode"]).apply(weighted)

Expand All @@ -129,26 +142,52 @@ def create(survey_dirs, transform_persons, transform_trips,
share = aggr.drop(columns=["n"])
aggr = share.copy()

# TODO: configurable output

aggr.to_csv("mode_share_ref.csv")
aggr.to_csv(output_prefix + "mode_share_ref.csv")

# Also normalize der distance group
# Also normalize per distance group
for dist_group in aggr.index.get_level_values(0).categories:
sub = aggr.loc[dist_group, :]
sub.share /= sub.share.sum()

aggr.to_csv("mode_share_per_dist_ref.csv")
aggr.to_csv(output_prefix + "mode_share_per_dist_ref.csv")

aggr = summarize_purposes(trips)

aggr.to_csv("trip_purposes_by_hour_ref.csv")
aggr.to_csv(output_prefix + "trip_purposes_by_hour_ref.csv")

aggr = summarize_mode_usage(persons, trips)
aggr.to_csv("mode_users_ref.csv")
aggr.to_csv(output_prefix + "mode_users_ref.csv")

groups = None
if ref_groups:

groups = []

for g in ref_groups:

if type(g) is str:
g = [g]

for x in g:
if x not in persons.columns:
raise ValueError("Column %s not found in persons" % x)

aggr = trips.groupby(g + ["main_mode"]).apply(weighted)
aggr["share"] = aggr.n / aggr.n.sum()
aggr["share"].fillna(0, inplace=True)
aggr.drop(columns=["n"], inplace=True)

# todo only works with one subgroup level
# Normalize per group
for group in aggr.index.get_level_values(0).categories:
sub = aggr.loc[group, :]
sub.share /= sub.share.sum()

groups.append(aggr)

groups = pd.concat(groups, axis=1)

# TODO: ref data per attribute ?
return persons, trips, share.groupby("main_mode").sum().drop(columns=["mean_dist"])
return AggregationResult(persons, trips, share.groupby("main_mode").sum(), groups=groups)


def main(args):
Expand Down

0 comments on commit 1b35bb7

Please sign in to comment.