diff --git a/backend/models/gtfs.py b/backend/models/gtfs.py index 689cfd8b..f8ecce8b 100644 --- a/backend/models/gtfs.py +++ b/backend/models/gtfs.py @@ -8,6 +8,8 @@ import gzip import hashlib import zipfile +import os +from datetime import datetime, timedelta from . import config, util, nextbus, routeconfig, timetables @@ -49,13 +51,18 @@ def get_stop_geometry(stop_xy, shape_lines_xy, shape_cumulative_dist, start_inde 'after_index': best_index, # the index of the coordinate of the shape just before this stop 'offset': int(best_offset) # distance in meters between this stop and the closest line segment of shape } + -def download_gtfs_data(agency: config.Agency, gtfs_cache_dir): +def get_gtfs_data(agency: config.Agency, gtfs_cache_dir, gtfs_path=None): + cache_dir = Path(gtfs_cache_dir) + zip_path = f'{util.get_data_dir()}/gtfs-{agency.id}.zip' gtfs_url = agency.gtfs_url + + if gtfs_url is None: raise Exception(f'agency {agency.id} does not have gtfs_url in config') - cache_dir = Path(gtfs_cache_dir) + if not cache_dir.exists(): print(f'downloading gtfs data from {gtfs_url}') r = requests.get(gtfs_url) @@ -63,14 +70,16 @@ def download_gtfs_data(agency: config.Agency, gtfs_cache_dir): if r.status_code != 200: raise Exception(f"Error fetching {gtfs_url}: HTTP {r.status_code}: {r.text}") - zip_path = f'{util.get_data_dir()}/gtfs-{agency.id}.zip' - with open(zip_path, 'wb') as f: f.write(r.content) - with zipfile.ZipFile(zip_path, 'r') as zip_ref: - zip_ref.extractall(gtfs_cache_dir) + if gtfs_path is not None: + zip_path = gtfs_path + with zipfile.ZipFile(zip_path, 'r') as zip_ref: + zip_ref.extractall(gtfs_cache_dir) + + def is_subsequence(smaller, bigger): smaller_len = len(smaller) bigger_len = len(bigger) @@ -108,15 +117,18 @@ def contains_excluded_stop(shape_stop_ids, excluded_stop_ids): return False class GtfsScraper: - def __init__(self, agency: config.Agency): + def __init__(self, agency: config.Agency, gtfs_path=None): self.agency = agency self.agency_id = 
agency_id = agency.id gtfs_cache_dir = f'{util.get_data_dir()}/gtfs-{agency_id}' - download_gtfs_data(agency, gtfs_cache_dir) - - self.feed = ptg.load_geo_feed(gtfs_cache_dir, {}) + get_gtfs_data(agency, gtfs_cache_dir, gtfs_path=gtfs_path) + if gtfs_path is None: + self.feed = ptg.load_geo_feed(gtfs_cache_dir, {}) + else: + self.feed = ptg.load_geo_feed(gtfs_path, {}) + self.errors = [] self.stop_times_by_trip = None self.stops_df = None @@ -261,7 +273,6 @@ def save_timetables(self, save_to_s3=False, skip_existing=False): agency_id = self.agency_id dates_map = self.get_services_by_date() - # # Typically, many dates have identical scheduled timetables (with times relative to midnight on that date). # Instead of storing redundant timetables for each date, store one timetable per route for each unique set of service_ids. @@ -1078,4 +1089,4 @@ def save_routes(self, save_to_s3, d): routes = [routeconfig.RouteConfig(agency_id, route_data) for route_data in routes_data] - routeconfig.save_routes(agency_id, routes, save_to_s3=save_to_s3) + routeconfig.save_routes(agency_id, routes, save_to_s3=save_to_s3, gtfs_date=d) diff --git a/backend/models/routeconfig.py b/backend/models/routeconfig.py index df664c8d..726a3f87 100644 --- a/backend/models/routeconfig.py +++ b/backend/models/routeconfig.py @@ -1,4 +1,5 @@ import re, os, time, requests, json, boto3, gzip +from pathlib import Path from . 
import util, config DefaultVersion = 'v3a' @@ -121,8 +122,13 @@ def get_directions_for_stop(self, stop_id): for s in direction['stops'] if s == stop_id ] -def get_cache_path(agency_id, version=DefaultVersion): - return f'{util.get_data_dir()}/routes_{version}_{agency_id}.json' +def get_cache_path(agency_id, version=DefaultVersion, gtfs_date=None): + if gtfs_date == None: + return f'{util.get_data_dir()}/routes_{version}_{agency_id}.json' + + return f'{util.get_data_dir()}/routes_{version}_{agency_id}-{gtfs_date}/routes_{version}_{agency_id}-{gtfs_date}.json' + + def get_s3_path(agency_id, version=DefaultVersion): return f'routes/{version}/routes_{version}_{agency_id}.json.gz' @@ -179,14 +185,17 @@ def get_route_config(agency_id, route_id, version=DefaultVersion): return route return None -def save_routes(agency_id, routes, save_to_s3=False): +def save_routes(agency_id, routes, save_to_s3=False, gtfs_date=None): data_str = json.dumps({ 'version': DefaultVersion, 'routes': [route.data for route in routes] }, separators=(',', ':')) - cache_path = get_cache_path(agency_id) - + cache_path = get_cache_path(agency_id, gtfs_date=gtfs_date) + cache_dir = Path(cache_path).parent + if not cache_dir.exists(): + cache_dir.mkdir(parents = True, exist_ok = True) + with open(cache_path, "w") as f: f.write(data_str) diff --git a/backend/save_routes.py b/backend/save_routes.py index fbfc13c5..ec9e43d1 100644 --- a/backend/save_routes.py +++ b/backend/save_routes.py @@ -1,7 +1,8 @@ -from models import gtfs, config +from models import gtfs, config, util from compute_stats import compute_stats_for_dates import argparse -from datetime import date +from datetime import date, datetime, timedelta +import os # Downloads and parses the GTFS specification # and saves the configuration for all routes to S3. 
@@ -32,9 +33,43 @@ #} # # -# Currently the script just overwrites the one S3 path, but this process could be extended in the future to -# store different paths for different dates, to allow fetching historical data for route configurations. -# +# When no date is provided the script just overwrites the one S3 path +# representing the recentmost GTFS that an agency has made available that +# is active. Providing the date adds -YYYY-MM-DD to the routes path, +# which would allow the backend to use versioned route files. + + +def get_recentmost_date_qualified_gtfs_path(gtfs_date): + ''' + Find the most recent GTFS zip file dated no later than gtfs_date. + recentmost_date_qualified_zip_file is: + "date qualified" and "recentmost" + + "date qualified" means the date of the file is no later than the date + argument given. + + "recentmost" means it is the most recent file that qualifies. + ''' + + recentmost_date_qualified_zip_file = "" + recentmost_date_qualified_date = gtfs_date + smallest_timedelta_so_far = timedelta.max + for candidate_zip_file in os.listdir(util.get_data_dir()): + if f'gtfs-{agency.id}-' in candidate_zip_file and '.zip' in candidate_zip_file: + candidate_year = candidate_zip_file.split('-')[2] + candidate_month = candidate_zip_file.split('-')[3] + candidate_day = candidate_zip_file.split('-')[4].split(".zip")[0] + candidate_date_string = candidate_year+"-"+candidate_month+"-"+candidate_day + candidate_date = datetime.strptime(candidate_date_string,"%Y-%m-%d").date() + if candidate_date - gtfs_date <= smallest_timedelta_so_far and candidate_date <= gtfs_date: + recentmost_date_qualified_date = candidate_date + recentmost_date_qualified_zip_file = candidate_zip_file + + gtfs_date = recentmost_date_qualified_date + gtfs_path = f'{util.get_data_dir()}/{recentmost_date_qualified_zip_file}' + if recentmost_date_qualified_zip_file == "": + print("an active GTFS for this date was not found") + raise SystemExit + return gtfs_path, gtfs_date if __name__ == '__main__': parser =
argparse.ArgumentParser(description='Save route configuration from GTFS and possibly Nextbus API') @@ -42,23 +77,40 @@ parser.add_argument('--s3', dest='s3', action='store_true', help='store in s3') parser.add_argument('--timetables', dest='timetables', action='store_true', help='also save timetables') parser.add_argument('--scheduled-stats', dest='scheduled_stats', action='store_true', help='also compute scheduled stats if the timetable has new dates (requires --timetables)') + parser.add_argument('--date', required=False) parser.set_defaults(s3=False) parser.set_defaults(timetables=False) parser.set_defaults(scheduled_stats=False) + parser.set_defaults(gtfs_date=None) args = parser.parse_args() agencies = [config.get_agency(args.agency)] if args.agency is not None else config.agencies save_to_s3 = args.s3 - d = date.today() + gtfs_date = args.date errors = [] - + for agency in agencies: - scraper = gtfs.GtfsScraper(agency) - scraper.save_routes(save_to_s3, d) + + if gtfs_date is None: + # save the normal way, downloading the most recent GTFS file + gtfs_date=date.today() + gtfs_path = None + else: + # save with date suffix, using the GTFS file provided + gtfs_date=datetime.strptime(gtfs_date, "%Y-%m-%d").date() + gtfs_path = f'{util.get_data_dir()}/gtfs-{agency.id}-{gtfs_date}.zip' + + gtfs_path, gtfs_date = get_recentmost_date_qualified_gtfs_path(gtfs_date) + # save the routes + scraper = gtfs.GtfsScraper(agency, gtfs_path=gtfs_path) + scraper.save_routes(save_to_s3, gtfs_date) + errors += scraper.errors + + if args.timetables: timetables_updated = scraper.save_timetables(save_to_s3=save_to_s3, skip_existing=True) @@ -66,7 +118,6 @@ dates = sorted(scraper.get_services_by_date().keys()) compute_stats_for_dates(dates, agency, scheduled=True, save_to_s3=save_to_s3) - errors += scraper.errors if errors: raise Exception("\n".join(errors))