Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Revise travel day #82

Open
wants to merge 19 commits into
base: 53-paths-with-79
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion config/base.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ nts_regions = [
]
# NTS day(s) of the week to use (list of day numbers)
# 1: Monday, 2: Tuesday, 3: Wednesday, 4: Thursday, 5: Friday, 6: Saturday, 7: Sunday
nts_day_of_week = 3
nts_days_of_week = [3]
# what crs do we want the output to be in? (just add the number, e.g. 3857)
output_crs = 3857

Expand Down
48 changes: 45 additions & 3 deletions scripts/2_match_households_and_individuals.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from acbm.assigning.utils import cols_for_assignment_all
from acbm.cli import acbm_cli
from acbm.config import load_and_setup_config
from acbm.matching import MatcherExact, match_individuals
from acbm.matching import MatcherExact, match_individuals, match_remaining_individuals
from acbm.preprocessing import (
count_per_group,
nts_filter_by_region,
Expand All @@ -16,6 +16,10 @@
transform_by_group,
truncate_values,
)
from acbm.utils import (
households_with_common_travel_days,
households_with_travel_days_in_nts_weeks,
)


@acbm_cli
Expand Down Expand Up @@ -222,23 +226,48 @@ def get_interim_path(

logger.info("Filtering NTS data by specified year(s)")

logger.info(f"Total NTS households: {nts_households.shape[0]:,.0f}")
years = config.parameters.nts_years

nts_individuals = nts_filter_by_year(nts_individuals, psu, years)
nts_households = nts_filter_by_year(nts_households, psu, years)
nts_trips = nts_filter_by_year(nts_trips, psu, years)

logger.info(
f"Total NTS households (after year filtering): {nts_households.shape[0]:,.0f}"
)
# #### Filter by geography
#

regions = config.parameters.nts_regions

nts_individuals = nts_filter_by_region(nts_individuals, psu, regions)
nts_households = nts_filter_by_region(nts_households, psu, regions)
nts_trips = nts_filter_by_region(nts_trips, psu, regions)

# Create dictionaries of key value pairs
logger.info(
f"Total NTS households (after region filtering): {nts_households.shape[0]:,.0f}"
)

# Ensure that the households have at least one day in `nts_days_of_week` that
# all household members have trips for
if config.parameters.common_household_day:
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The new boolean parameter `common_household_day` determines whether all individuals in a household need to have a TravDay in common.

hids = households_with_common_travel_days(
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Gets the subset of households where all individuals have a common TravDay that is in the set of configured days (config.parameters.nts_days_of_week)

nts_trips, config.parameters.nts_days_of_week
)
else:
hids = households_with_travel_days_in_nts_weeks(
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Gets the subset of households where all individuals have any TravDay that is in the set of configured days (config.parameters.nts_days_of_week)

nts_trips, config.parameters.nts_days_of_week
)

# Subset individuals and households given filtering of trips
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Subset to the households selected above before matching, to ensure that matches have the required TravDays.

nts_trips = nts_trips[
nts_trips["HouseholdID"].isin(hids)
& nts_trips["TravDay"].isin(config.parameters.nts_days_of_week)
]
nts_individuals = nts_individuals[nts_individuals["HouseholdID"].isin(hids)]
nts_households = nts_households[nts_households["HouseholdID"].isin(hids)]

# Create dictionaries of key value pairs
"""
guide to the dictionaries:

Expand Down Expand Up @@ -924,6 +953,19 @@ def get_interim_path(
show_progress=True,
)

# match remaining individuals
remaining_ids = spc_edited.loc[
~spc_edited.index.isin(matches_ind.keys()), "id"
].to_list()
matches_remaining_ind = match_remaining_individuals(
df1=spc_edited,
df2=nts_individuals,
matching_columns=["age_group", "sex"],
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could update the matching_columns here to enable more precision when not using households, e.g. by adding employment status and urban/rural classification.

remaining_ids=remaining_ids,
show_progress=True,
)
matches_ind.update(matches_remaining_ind)

Comment on lines +956 to +968
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add matching for any remaining individuals that were part of unmatched households. It might be worth considering if this should be more configurable.

# save random sample
with open(
get_interim_path("matches_ind_level_categorical_random_sample.pkl"), "wb"
Expand Down
13 changes: 9 additions & 4 deletions scripts/3.1_assign_primary_feasible_zones.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from acbm.assigning.utils import (
activity_chains_for_assignment,
get_activities_per_zone,
get_chosen_day,
intrazone_time,
replace_intrazonal_travel_time,
zones_to_time_matrix,
Expand All @@ -28,11 +29,15 @@ def main(config_file):
activity_chains = activity_chains_for_assignment(config)
logger.info("Activity chains loaded")

# Filter to a specific day of the week
logger.info("Filtering activity chains to a specific day of the week")
activity_chains = activity_chains[
activity_chains["TravDay"] == config.parameters.nts_day_of_week
]

# Generate random sample of days by household
get_chosen_day(config).to_parquet(
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Get a chosen day for each individual to represent a "sample" day given the configured days of the week and whether the household is configured to share a common day.

config.output_path / "interim" / "assigning" / "chosen_trav_day.parquet"
)

# Filter to chosen day
activity_chains = activity_chains_for_assignment(config, subset_to_chosen_day=True)

# --- Study area boundaries

Expand Down
5 changes: 1 addition & 4 deletions scripts/3.2.1_assign_primary_zone_edu.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,8 @@ def main(config_file):
logger.info("Loading activity chains")

activity_chains = activity_chains_for_assignment(
config, columns=cols_for_assignment_edu()
config, columns=cols_for_assignment_edu(), subset_to_chosen_day=True
)
activity_chains = activity_chains[
activity_chains["TravDay"] == config.parameters.nts_day_of_week
]

logger.info("Filtering activity chains for trip purpose: education")
activity_chains_edu = activity_chains[activity_chains["dact"] == "education"]
Expand Down
8 changes: 3 additions & 5 deletions scripts/3.2.2_assign_primary_zone_work.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,9 @@ def main(config_file):

# --- Activity chains
logger.info("Loading activity chains")

activity_chains = activity_chains_for_assignment(config, cols_for_assignment_work())
activity_chains = activity_chains[
activity_chains["TravDay"] == config.parameters.nts_day_of_week
]
activity_chains = activity_chains_for_assignment(
config, cols_for_assignment_work(), subset_to_chosen_day=True
)

logger.info("Filtering activity chains for trip purpose: work")
activity_chains_work = activity_chains[activity_chains["dact"] == "work"]
Expand Down
5 changes: 1 addition & 4 deletions scripts/3.2.3_assign_secondary_zone.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,7 @@ def main(config_file):
# --- Load in the data
logger.info("Loading: activity chains")

activity_chains = activity_chains_for_assignment(config)
activity_chains = activity_chains[
activity_chains["TravDay"] == config.parameters.nts_day_of_week
]
activity_chains = activity_chains_for_assignment(config, subset_to_chosen_day=True)

# TODO: remove obsolete comment
# --- Add OA21CD to the data
Expand Down
2 changes: 1 addition & 1 deletion scripts/4_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def main(config_file):
# NTS data
legs_nts = pd.read_parquet(config.output_path / "nts_trips.parquet")

legs_nts = legs_nts[legs_nts["TravDay"] == config.parameters.nts_day_of_week]
legs_nts = legs_nts[legs_nts["TravDay"].isin(config.parameters.nts_days_of_week)]

# Model outputs
legs_acbm = pd.read_csv(config.output_path / "legs.csv")
Expand Down
2 changes: 1 addition & 1 deletion src/acbm/assigning/feasible_zones_primary.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
activity_chains_schema = DataFrameSchema(
{
"mode": Column(str),
"TravDay": Column(pa.Float, Check.isin([1, 2, 3, 4, 5, 6, 7]), nullable=True),
# "TravDay": Column(pa.Float, Check.isin([1, 2, 3, 4, 5, 6, 7]), nullable=True),
"tst": Column(pa.Float, Check.less_than_or_equal_to(1440), nullable=True),
"TripTotalTime": Column(pa.Float, nullable=True),
# TODO: add more columns ...
Expand Down
132 changes: 128 additions & 4 deletions src/acbm/assigning/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import geopandas as gpd
import numpy as np
import pandas as pd
import polars as pl

from acbm.config import Config

Expand All @@ -11,25 +12,26 @@ def cols_for_assignment_all() -> list[str]:
"""Gets activity chains with subset of columns required for assignment."""
return [
*cols_for_assignment_edu(),
"household",
"oact",
"nts_ind_id",
"nts_hh_id",
"age_years",
"TripDisIncSW",
"tet",
"DayID",
]


def cols_for_assignment_edu() -> list[str]:
"""Gets activity chains with subset of columns required for assignment."""
return [
"id",
"household",
"TravDay",
"OA11CD",
"dact",
"mode",
"tst",
"id",
"seq",
"TripTotalTime",
"education_type",
Expand All @@ -42,16 +44,26 @@ def cols_for_assignment_work() -> list[str]:


def activity_chains_for_assignment(
config: Config, columns: list[str] | None = None
config: Config, columns: list[str] | None = None, subset_to_chosen_day: bool = False
) -> pd.DataFrame:
"""Gets activity chains with subset of columns required for assignment."""
if columns is None:
columns = cols_for_assignment_all()

return pd.read_parquet(
activity_chains = pd.read_parquet(
config.spc_with_nts_trips_filepath,
columns=columns,
)
if not subset_to_chosen_day:
return activity_chains

return activity_chains.merge(
pd.read_parquet(
config.output_path / "interim" / "assigning" / "chosen_trav_day.parquet"
),
on=["id", "TravDay"],
how="inner",
)


def _map_time_to_day_part(
Expand Down Expand Up @@ -562,3 +574,115 @@ def replace_intrazonal_travel_time(

# Return the modified DataFrame
return travel_times_copy


def get_chosen_day(config: Config) -> pd.DataFrame:
    """Choose a travel day for each individual in the activity chains.

    The activity chains are loaded with ``activity_chains_for_assignment`` and
    a ``TravDay`` is sampled according to the configuration:

    * ``config.parameters.common_household_day`` is true: one day is sampled
      per household from the distinct ``TravDay`` values of that household,
      and only individuals with trips on that day are returned.
    * otherwise: a day is sampled per individual, guided by the individual's
      ``pwkstat`` (read from ``config.spc_combined_filepath``) so that workers
      are more likely to be given a day that includes a work trip.

    Args:
        config: Project configuration providing the parameters and file paths.

    Returns:
        A pandas DataFrame with columns ``id`` and ``TravDay``, sorted by
        ``id``.
    """
    acs = pl.DataFrame(activity_chains_for_assignment(config))

    if config.parameters.common_household_day:
        # One day per household, drawn from the days observed in the household.
        household_days = (
            acs.group_by("household")
            .agg(pl.col("TravDay").unique().sample(1, with_replacement=True))
            .explode("TravDay")
        )
        return (
            acs.join(household_days, on=["household", "TravDay"], how="inner")
            .select(["id", "TravDay"])
            .unique()
            .sort("id")
            .to_pandas()
        )

    # Otherwise each individual gets a day sampled independently of the rest
    # of their household.

    # A random day on which the individual has at least one work trip
    work_days = (
        acs.filter(pl.col("dact").eq("work"))
        .group_by("id")
        .agg(pl.col("TravDay").unique())
        .select(["id", pl.col("TravDay").list.drop_nulls().list.sample(n=1)])
        .explode("TravDay")
        .rename({"TravDay": "TravDayWork"})
    )
    # A random day on which the individual has at least one non-work trip
    non_work_days = (
        acs.filter(~pl.col("dact").eq("work"))
        .group_by("id")
        .agg(pl.col("TravDay").unique())
        .select(["id", pl.col("TravDay").list.drop_nulls().list.sample(n=1)])
        .explode("TravDay")
        .rename({"TravDay": "TravDayNonWork"})
    )
    # A random day among all of the individual's days (null if there are none)
    any_days = (
        acs.group_by("id")
        .agg(pl.col("TravDay").unique())
        .select(["id", pl.col("TravDay").list.drop_nulls()])
        .select(
            [
                "id",
                pl.when(pl.col("TravDay").list.len() > 0)
                # Note: this has to be set to with_replacement despite non-empty check
                .then(pl.col("TravDay").list.sample(n=1, with_replacement=True))
                .otherwise(None),
            ]
        )
        .explode("TravDay")
        .rename({"TravDay": "TravDayAny"})
        .sort("id")
    )
    # One uniform draw per individual, used below for part-time workers. It is
    # attached to the per-id table (after sorting) so that every trip row of an
    # individual shares the same draw and the result stays one day per id.
    any_days = any_days.with_columns(
        pl.Series("PartTimeDraw", np.random.random(any_days.height))
    )

    # Combine day choices for different conditions
    acs_combine = (
        acs.join(work_days, on="id", how="left", coalesce=True)
        .join(non_work_days, on="id", how="left", coalesce=True)
        .join(any_days, on="id", how="left", coalesce=True)
        .join(
            pl.scan_parquet(config.spc_combined_filepath)
            .select(["id", "pwkstat"])
            .collect(),
            on="id",
        )
    )

    is_full_time = pl.col("pwkstat").eq(1)
    is_part_time = pl.col("pwkstat").eq(2)
    has_work_day = pl.col("TravDayWork").is_not_null()
    has_non_work_day = pl.col("TravDayNonWork").is_not_null()

    # Sample a chosen day given an individual's pwkstat value, to increase the
    # likelihood of choosing a day that includes a work trip.
    acs_combine = acs_combine.with_columns(
        # Full time (pwkstat = 1) with a work travel day available
        pl.when(is_full_time & has_work_day)
        .then(pl.col("TravDayWork"))
        # Part time (pwkstat = 2) with both a work and a non-work day available:
        # take the work day with probability `part_time_work_prob`
        # (NOTE(review): assumes this parameter is the probability of the work
        # day being chosen - confirm against the config documentation)
        .when(is_part_time & has_work_day & has_non_work_day)
        .then(
            pl.when(pl.col("PartTimeDraw") < config.parameters.part_time_work_prob)
            .then(pl.col("TravDayWork"))
            .otherwise(pl.col("TravDayNonWork"))
        )
        # Everyone else (including full time without a work day): any day
        .otherwise(pl.col("TravDayAny"))
        .alias("ChosenTravDay")
    )

    return (
        acs_combine.select(["id", "ChosenTravDay"])
        .unique()
        .rename({"ChosenTravDay": "TravDay"})
        .sort("id")
        .to_pandas()
    )
7 changes: 6 additions & 1 deletion src/acbm/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import geopandas as gpd
import jcs
import numpy as np
import polars as pl
import tomlkit
from pydantic import BaseModel, Field, field_serializer, field_validator

Expand All @@ -26,10 +27,12 @@ class Parameters(BaseModel):
boundary_geography: str
nts_years: list[int]
nts_regions: list[str]
nts_day_of_week: int
nts_days_of_week: list[int]
output_crs: int
tolerance_work: float | None = None
tolerance_edu: float | None = None
common_household_day: bool = True
part_time_work_prob: float = 0.7


@dataclass(frozen=True)
Expand Down Expand Up @@ -359,6 +362,8 @@ def init_rng(self):
try:
np.random.seed(self.seed)
random.seed(self.seed)
pl.set_random_seed(self.seed)

except Exception as err:
msg = f"config does not provide a rng seed with err: {err}"
raise ValueError(msg) from err
Expand Down
Loading
Loading