Skip to content

Commit

Permalink
Nonrevenue trips only need to be handled in pre Dec 2023 data (#34)
Browse files Browse the repository at this point in the history
  • Loading branch information
devinmatte authored Nov 27, 2024
1 parent 714ce29 commit 9e5ae18
Show file tree
Hide file tree
Showing 4 changed files with 601 additions and 545 deletions.
9 changes: 5 additions & 4 deletions mbta-performance/chalicelib/lamp/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,9 @@
}

# if a trip_id begins with NONREV-, it is not revenue producing and thus not something we want to benchmark
# if an event has a trip_id begins with ADDED-, then a downstream process was unable to determine the scheduled trip
# that the vehicle is currently on (this can be due to AVL glitches, trip diversions, test train trips, etc.)
TRIP_IDS_TO_DROP = ("NONREV-",) # "ADDED-")
# We only drop these trips for service dates before December 2023, as the prefix is an unreliable indicator of actual revenue service.
# Non-revenue trips after that are properly ignored by the LAMP system, and don't appear in the dataset anymore.
TRIP_IDS_TO_DROP = ("NONREV-",)

# defining these columns in particular because we use them everywhere
RTE_DIR_STOP = ["route_id", "direction_id", "stop_id"]
Expand Down Expand Up @@ -209,7 +209,8 @@ def ingest_pq_file(pq_df: pd.DataFrame, service_date: date) -> pd.DataFrame:
# from that which GTFS reports in its schedule. Replace for better schedule matching.
pq_df["stop_id"] = pq_df["stop_id"].replace(STOP_ID_NUMERIC_MAP)
# drop non-revenue producing events
pq_df = pq_df[~pq_df["trip_id"].str.startswith(TRIP_IDS_TO_DROP)]
cutoff_date = format_dateint(20231130)
pq_df = pq_df[~((pq_df["trip_id"].str.startswith(TRIP_IDS_TO_DROP)) & (pq_df["service_date"] < cutoff_date))]

processed_daily_events = _process_arrival_departure_times(pq_df)
processed_daily_events = processed_daily_events[processed_daily_events["stop_id"].notna()]
Expand Down
4 changes: 1 addition & 3 deletions mbta-performance/chalicelib/lamp/tests/test_ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,13 +95,11 @@ def test_ingest_pq_file(self):

with mock.patch("chalicelib.lamp.ingest.fetch_stop_times_from_gtfs", return_value=self.mock_gtfs_data):
pq_df_after = ingest.ingest_pq_file(pq_df_before, date(2024, 2, 7))
nonrev = pq_df_after[pq_df_after["trip_id"].str.startswith("NONREV-")]
added = pq_df_after[pq_df_after["trip_id"].str.startswith("ADDED-")]
null_id_events = pq_df_after[pq_df_after["stop_id"].isna()]
self.assertTrue(nonrev.empty)
self.assertEqual(added.shape, (3763, 17))
self.assertTrue(null_id_events.empty)
self.assertEqual(pq_df_after.shape, (16700, 17))
self.assertEqual(pq_df_after.shape, (17074, 17))
self.assertEqual(set(pq_df_after["service_date"].unique()), {"2024-02-07"})

def test__average_scheduled_headways(self):
Expand Down
Loading

0 comments on commit 9e5ae18

Please sign in to comment.