diff --git a/config/base.toml b/config/base.toml index 410dc11..dde6447 100644 --- a/config/base.toml +++ b/config/base.toml @@ -37,6 +37,19 @@ optional_columns = [ ] n_matches = 10 # What is the maximum number of NTS matches we want for each SPC household? +[feasible_assignment] +# `actual_distance = distance * (1 + ((detour_factor - 1) * np.exp(-decay_rate * distance)))` +# +# `detour factor` when converting Euclidean distance to actual travel distance +detour_factor = 1.56 + +# `decay rate` is the inverse of the distance (in units of the data, e.g. metres) at which the +# scaling from using the detour factor to Euclidean distance reduces by `exp(−1)`. +# +# 0.0001 is a good value for Leeds when units are metres, choice of decay_rate can be explored in an +# [interactive plot](https://www.wolframalpha.com/input?i=plot+exp%28-0.0001x%29+from+x%3D0+to+x%3D50000) +decay_rate = 0.0001 + [work_assignment] commute_level = "OA" use_percentages = true # if true, optimization problem will try to minimize percentage difference at OD level (not absolute numbers). Recommended to set it to true diff --git a/config/base_all_msoa.toml b/config/base_all_msoa.toml new file mode 100644 index 0000000..4593485 --- /dev/null +++ b/config/base_all_msoa.toml @@ -0,0 +1,78 @@ +[parameters] +seed = 0 + # this is used to query poi data from osm and to load in SPC data +region = "leeds" +# how many people from the SPC do we want to run the model for? Comment out if you want to run the analysis on the entire SPC populaiton +number_of_households = 25000 +# "OA21CD": OA level, "MSOA11CD": MSOA level +zone_id = "MSOA21CD" + # Only set to true if you have travel time matrix at the level specified in boundary_geography +travel_times = false +boundary_geography = "MSOA" +# NTS years to use +nts_years = [2019, 2021, 2022] +# NTS regions to use +nts_regions = [ + 'Yorkshire and the Humber', + 'North West', + 'North East', + 'East Midlands', + 'West Midlands', + 'East of England', + 'South East', + 'South West'] +# nts day of the week to use +# 1: Monday, 2: Tuesday, 3: Wednesday, 4: Thursday, 5: Friday, 6: Saturday, 7: Sunday +nts_day_of_week = 3 +# what crs do we want the output to be in? (just add the number, e.g. 3857) +output_crs = 3857 + +[matching] +# for optional and required columns, see the [iterative_match_categorical](https://github.com/Urban-Analytics-Technology-Platform/acbm/blob/ca181c54d7484ebe44706ff4b43c26286b22aceb/src/acbm/matching.py#L110) function +# Do not add any column not listed below. You can only move a column from optional to require (or vise versa) +required_columns = [ + "number_adults", + "number_children", + "num_pension_age", +] +optional_columns = [ + "number_cars", + "rural_urban_2_categories", + "employment_status", + "tenure_status", +] +# What is the maximum number of NTS matches we want for each SPC household? +n_matches = 10 + +[feasible_assignment] +# detour factor when converting euclidian distance to actual travel distance +detour_factor = 1.56 +# decay rate when converting euclidian to travel distance (0.0001 is a good value) +# actual_distance = distance * (1 + ((detour_factor - 1) * np.exp(-decay_rate * distance))) +decay_rate = 0.0001 + +[work_assignment] +commute_level = "MSOA" +# if true, optimization problem will try to minimize percentage difference at OD level (not absolute numbers). Recommended to set it to true +use_percentages = true +# weights to add for each objective in the optimization problem +weight_max_dev = 0.2 +weight_total_dev = 0.8 +# maximum number of feasible zones to include in the optimization problem (less zones makes problem smaller - so faster, but at the cost of a better solution) +max_zones = 10 + +[postprocessing] +pam_jitter = 30 +pam_min_duration = 10 +# for get_pt_subscription: everyone above this age has a subscription (pensioners get free travel) +# TODO: more sophisticated approach +pt_subscription_age = 66 +# to define if a person is a student: +# eveyone below this age is a student +student_age_base = 16 +# everyone below this age that has at least one "education" activity is a student +student_age_upper = 30 +# eveyone who uses one of the modes below is classified as a passenger (isPassenger = True) +modes_passenger = ['car_passenger', 'taxi'] +# yearly state pension: for getting hhlIncome of pensioners +state_pension = 11502 diff --git a/scripts/3.1_assign_primary_feasible_zones.py b/scripts/3.1_assign_primary_feasible_zones.py index 613e06f..392be8a 100644 --- a/scripts/3.1_assign_primary_feasible_zones.py +++ b/scripts/3.1_assign_primary_feasible_zones.py @@ -74,7 +74,11 @@ def main(config_file): logger.info("Creating estimated travel times matrix") # Create a new travel time matrix based on distances between zones travel_time_estimates = zones_to_time_matrix( - zones=boundaries, id_col=config.zone_id, time_units="m" + zones=boundaries, + id_col=config.zone_id, + time_units="m", + detour_factor=config.feasible_assignment.detour_factor, + decay_rate=config.feasible_assignment.decay_rate, ) logger.info("Travel time estimates created") @@ -203,6 +207,8 @@ def main(config_file): time_tolerance=config.parameters.tolerance_edu if config.parameters.tolerance_edu is not None else 0.3, + detour_factor=config.feasible_assignment.detour_factor, + decay_rate=config.feasible_assignment.decay_rate, ) logger.info("Saving feasible zones for education activities") @@ -230,6 +236,8 @@ def main(config_file): time_tolerance=config.parameters.tolerance_work if config.parameters.tolerance_work is not None else 0.3, + detour_factor=config.feasible_assignment.detour_factor, + decay_rate=config.feasible_assignment.decay_rate, ) logger.info("Saving feasible zones for work activities") diff --git a/scripts/3.3_assign_facility_all.py b/scripts/3.3_assign_facility_all.py index 2099b9b..ba73bdf 100644 --- a/scripts/3.3_assign_facility_all.py +++ b/scripts/3.3_assign_facility_all.py @@ -307,7 +307,10 @@ def main(config_file): x_col="TripDisIncSW", y_col="length", x_label="Reported Travel Distance (km)", - y_label="Actual Distance - Euclidian (km)", + y_label="Actual Distance - Estimated (km)", + detour_factor=config.feasible_assignment.detour_factor, + decay_rate=config.feasible_assignment.decay_rate, + x_axis_max=50, crs=f"EPSG:{config.output_crs}", title_prefix=f"Scatter plot of TripDisIncSW vs. Length for {activity_type}", save_dir=config.output_path / "plots/assigning/", @@ -330,8 +333,11 @@ def main(config_file): activity_type_col="destination activity", x_col="TripTotalTime", y_col="length", - x_label="Reported Travel TIme (min)", - y_label="Actual Distance - Euclidian (km)", + x_label="Reported Travel Time (min)", + y_label="Actual Distance - Estimated (km)", + detour_factor=config.feasible_assignment.detour_factor, + decay_rate=config.feasible_assignment.decay_rate, + x_axis_max=180, crs=f"EPSG:{config.output_crs}", title_prefix="Scatter plot of TripTotalTime vs. Length", save_dir=config.output_path / "plots/assigning/", diff --git a/scripts/4_validation.py b/scripts/4_validation.py index ab469cf..ba246ce 100644 --- a/scripts/4_validation.py +++ b/scripts/4_validation.py @@ -217,6 +217,8 @@ def main(config_file): start_wkt_col="start_location_geometry_wkt", end_wkt_col="end_location_geometry_wkt", crs_epsg=config.output_crs, + detour_factor=1.56, + decay_rate=0.0001, ) # Plot: Aggregate diff --git a/src/acbm/assigning/feasible_zones_primary.py b/src/acbm/assigning/feasible_zones_primary.py index 9c29807..9ef9f26 100644 --- a/src/acbm/assigning/feasible_zones_primary.py +++ b/src/acbm/assigning/feasible_zones_primary.py @@ -77,6 +77,8 @@ def get_possible_zones( travel_times: Optional[pd.DataFrame] = None, filter_by_activity: bool = False, time_tolerance: float = 0.2, + detour_factor: float = 1.56, + decay_rate: float = 0.0001, ) -> dict: """ Get possible zones for all activity chains in the dataset. This function loops over the travel_times dataframe and filters by mode, time of day and weekday/weekend. @@ -106,6 +108,13 @@ def get_possible_zones( activity chain's travel time (which is stored in "TripTotalTime"). Allowable travel_times are those that fall in the range of: travel_time_reported * (1 - time_tolerance) <= travel_time_reported <= travel_time_reported * (1 + time_tolerance) Default = 0.2 + detour_factor: float + The detour factor used to calculate travel distances between zones from euclidian distances. + Default = 1.56 + decay_rate: float + The decay rate used to calculate travel times between zones from travel distances. Detours make up a smaller portion of + longer distance trips. Default = 0.0001 + Returns ------- @@ -137,7 +146,11 @@ def get_possible_zones( if travel_times is None: logger.info("Travel time matrix not provided: Creating travel times estimates") travel_times = zones_to_time_matrix( - zones=boundaries, id_col=zone_id, time_units="m" + zones=boundaries, + id_col=zone_id, + time_units="m", + detour_factor=detour_factor, + decay_rate=decay_rate, ) list_of_modes = activity_chains["mode"].unique() @@ -224,7 +237,7 @@ def get_possible_zones( travel_times_filtered_mode_time_day = travel_times_filtered_mode # if mode = car_passenger, we compare to the travel time for car (we don't # have travel times for car_passenger) - if mode == "car_passenger": + if mode in ("car_passenger", "taxi"): activity_chains_filtered = activity_chains[ (activity_chains["mode"] == "car") ] diff --git a/src/acbm/assigning/plots.py b/src/acbm/assigning/plots.py index 80a7bd1..c401d2b 100644 --- a/src/acbm/assigning/plots.py +++ b/src/acbm/assigning/plots.py @@ -11,6 +11,8 @@ from matplotlib import patches as mpatches from shapely.geometry import LineString +from acbm.assigning.utils import _adjust_distance + def plot_workzone_assignment_line( assignment_results: pd.DataFrame, @@ -387,7 +389,10 @@ def plot_scatter_actual_reported( title_prefix: str, activity_type: str, activity_type_col: str, + detour_factor: float, + decay_rate: float, crs: str, + x_axis_max: float, save_dir: str | Path | None = None, display: bool = False, y_scale: float = 1 / 1000, @@ -405,7 +410,10 @@ def plot_scatter_actual_reported( - title_prefix: Prefix for the plot titles. - activity_type: Type of activity to plot. - activity_type_col: Column name for the activity type. + - detour_factor: Detour factor for the distance adjustment. (see `_adjust_distance`) + - decay_rate: Decay rate for the distance adjustment. - crs: Coordinate reference system (CRS) of the activities data. + - x_axis_max: Maximum value for the x-axis. (use to remove very high reported distances / times) - save_dir: Directory to save the plots. If None, plots are not saved. - display: Whether to display plots by calling `plt.show()`. """ @@ -434,6 +442,13 @@ def plot_scatter_actual_reported( # calculate the length of the line_geometry in meters activity_chains_plot["length"] = activity_chains_plot["line_geometry"].length + # Create a column for estimated distance + activity_chains_plot["length"] = activity_chains_plot["length"].apply( + lambda d: _adjust_distance( + d, detour_factor=detour_factor, decay_rate=decay_rate + ) + ) + # Calculate the number of rows and columns for the subplots. It is a function of the number of modes nrows = math.ceil(len(activity_chains_plot["mode"].unique()) / 2) ncols = 2 @@ -460,6 +475,10 @@ def plot_scatter_actual_reported( p = np.poly1d(z) ax.plot(subset_mode[x_col], p(subset_mode[x_col]), "r--") + # Set x-axis limit + if x_axis_max: + ax.set_xlim(0, x_axis_max) + ax.set_title(f"{title_prefix} for mode: {mode}") ax.set_xlabel(x_label) ax.set_ylabel(y_label) diff --git a/src/acbm/assigning/utils.py b/src/acbm/assigning/utils.py index c829cc1..d866d9e 100644 --- a/src/acbm/assigning/utils.py +++ b/src/acbm/assigning/utils.py @@ -286,10 +286,39 @@ def _get_activities_per_zone_df(activities_per_zone: dict) -> pd.DataFrame: return pd.concat(activities_per_zone.values()) +def _adjust_distance( + distance: float, + detour_factor: float, + decay_rate: float, +) -> float: + """ + Adjusts euclidian distances by adding a detour factor. We use minkowski distance + and a decay rate, as longer detour makes up a smaller proportion of the total + distance as the distance increases. + + Parameters + ---------- + distance : float + The original distance. + detour_factor : float + The detour factor to be applied. + decay_rate : float + The decay rate to be applied. + + Returns + ------- + float + The adjusted distance. + """ + return distance * (1 + ((detour_factor - 1) * np.exp(-decay_rate * distance))) + + def zones_to_time_matrix( zones: gpd.GeoDataFrame, time_units: str, id_col: Optional[str] = None, + detour_factor: float = 1.56, + decay_rate: float = 0.0001, ) -> pd.DataFrame: """ Calculates the distance matrix between the centroids of the given zones and returns it as a DataFrame. The matrix also adds @@ -307,6 +336,8 @@ def zones_to_time_matrix( The name of the column in the zones GeoDataFrame to use as the ID. If None, the index values are used. Default is None. time_units: str, optional The units to use for the travel time. Options are 's' for seconds and 'm' for minutes. + detour_factor: float, optional + The detour factor to apply to the distance. Default is 1.56. Returns ------- pd.DataFrame @@ -323,6 +354,11 @@ def zones_to_time_matrix( distances = distances.stack().reset_index() distances.columns = [f"{id_col}_from", f"{id_col}_to", "distance"] + # adjust distance column by adding a detour factor + distances["distance"] = distances["distance"].apply( + lambda d: _adjust_distance(d, detour_factor, decay_rate) + ) + # replace the index values with the id_col values if id_col is specified if id_col is not None: # create a mapping from index to id_col values @@ -449,7 +485,8 @@ def intrazone_time(zones: gpd.GeoDataFrame, key_column: str) -> dict: # Calculate the area of each zone zones["area"] = zones["geometry"].area # Calculate the average distance within each zone - zones["average_dist"] = np.sqrt(zones["area"]) / 2 + # sqrt(area) / 2 would be radius + zones["average_dist"] = np.sqrt(zones["area"]) / 1.5 # Mode speeds in m/s mode_speeds_mps = { diff --git a/src/acbm/config.py b/src/acbm/config.py index 1bb1e4e..477add7 100644 --- a/src/acbm/config.py +++ b/src/acbm/config.py @@ -40,6 +40,12 @@ class MatchingParams(BaseModel): chunk_size: int = 50_000 +@dataclass(frozen=True) +class FeasibleAssignmentParams(BaseModel): + detour_factor: float + decay_rate: float + + @dataclass(frozen=True) class WorkAssignmentParams(BaseModel): use_percentages: bool @@ -84,10 +90,13 @@ def serialize_filepath(self, filepath: Path | str | None) -> str | None: class Config(BaseModel): parameters: Parameters = Field(description="Config: parameters.") + matching: MatchingParams = Field(description="Config: parameters for matching.") + feasible_assignment: FeasibleAssignmentParams = Field( + description="Config: parameters for assignment of feasible zones." + ) work_assignment: WorkAssignmentParams = Field( description="Config: parameters for work assignment." ) - matching: MatchingParams = Field(description="Config: parameters for matching.") postprocessing: Postprocessing = Field( description="Config: parameters for postprocessing." ) diff --git a/src/acbm/validating/utils.py b/src/acbm/validating/utils.py index 3ea2b40..dfc999f 100644 --- a/src/acbm/validating/utils.py +++ b/src/acbm/validating/utils.py @@ -2,6 +2,8 @@ import pandas as pd from shapely import wkt +from acbm.assigning.utils import _adjust_distance + def process_sequences( df: pd.DataFrame, @@ -76,6 +78,8 @@ def calculate_od_distances( end_wkt_col: str, crs_epsg: int, projected_epsg: int = 3857, + detour_factor: float = 1.56, + decay_rate: float = 0.0001, ) -> pd.DataFrame: """ Calculate distances between start and end geometries in a DataFrame. @@ -94,6 +98,11 @@ def calculate_od_distances( projected_epsg: int EPSG code for the projected CRS (default is 3857). We need a metric crs to calculte distances in meters. + detour_factor: float + Factor to adjust the estimated distance. + decay_rate: float + Decay rate for the distance adjustment. Detours are a smaller proportion of + the direct distance for longer trips. Returns ------- @@ -120,7 +129,17 @@ def calculate_od_distances( gdf = gdf.to_crs(epsg=projected_epsg) end_gdf = end_gdf.to_crs(epsg=projected_epsg) - # Calculate the distance between start and end geometries (in km) - gdf["distance"] = round(gdf.geometry.distance(end_gdf.geometry) / 1000, 1) + # Calculate the distance between start and end geometries (in m) + gdf["distance"] = gdf.geometry.distance(end_gdf.geometry) + + # Estimate actual travel distance + gdf["distance"] = gdf["distance"].apply( + lambda d: _adjust_distance( + d, detour_factor=detour_factor, decay_rate=decay_rate + ) + ) + + # convert distance to km + gdf["distance"] = round(gdf["distance"] / 1000, 1) return gdf diff --git a/tests/test_assigning.py b/tests/test_assigning.py index 76775cd..8cfa1f0 100644 --- a/tests/test_assigning.py +++ b/tests/test_assigning.py @@ -1,7 +1,8 @@ +import numpy as np import pandas as pd import pytest -from acbm.assigning.utils import _map_day_to_wkday_binary +from acbm.assigning.utils import _adjust_distance, _map_day_to_wkday_binary # applying to a single value @@ -32,3 +33,17 @@ def test_map_day_to_wkday_binary_df_invalid(): df = pd.DataFrame({"day": [1, 2, 3, 4, 5, 6, 7, 8]}) with pytest.raises(ValueError, match="Day should be numeric and in the range 1-7"): df["wkday"] = df["day"].apply(_map_day_to_wkday_binary) + + +def test_adjust_distance(): + assert np.isclose(_adjust_distance(0, 1.5, 0.1), 0) + assert np.isclose(_adjust_distance(10000, 1.56, 0.0001), 12060.125) + assert np.isclose( + _adjust_distance(10000, 1.56, 0.0001), + 10000 * (1 + ((1.56 - 1) * np.exp(-0.0001 * 10000))), + ) + assert np.isclose(_adjust_distance(50000, 1.8, 0.0002), 50001.81599) + assert np.isclose( + _adjust_distance(50000, 1.8, 0.0002), + 50000 * (1 + ((1.8 - 1) * np.exp(-0.0002 * 50000))), + ) diff --git a/tests/test_config.py b/tests/test_config.py index 5ac5b86..759816e 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -11,4 +11,4 @@ def config(): def test_id(config): - assert config.id == "e4589718b2" + assert config.id == "0ebb8c3ee7"