From 614bfe316d9e62098be406492f2597d7b6189be7 Mon Sep 17 00:00:00 2001
From: Brian-Lee
Date: Wed, 22 Apr 2020 14:59:02 -0700
Subject: [PATCH 01/55] renamed save_routes method to save_new_routes

---
 backend/models/gtfs.py | 4 ++--
 backend/save_routes.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/backend/models/gtfs.py b/backend/models/gtfs.py
index 689cfd8b..1df479df 100644
--- a/backend/models/gtfs.py
+++ b/backend/models/gtfs.py
@@ -1058,7 +1058,7 @@ def get_sort_key(route_data):
             return route_data['title']
 
         return sorted(routes_data, key=get_sort_key)
 
-    def save_routes(self, save_to_s3, d):
+    def save_new_routes(self, save_to_s3, d):
         agency = self.agency
         agency_id = agency.id
         routes_df = self.get_gtfs_routes()
@@ -1078,4 +1078,4 @@ def save_routes(self, save_to_s3, d):
 
         routes = [routeconfig.RouteConfig(agency_id, route_data) for route_data in routes_data]
 
-        routeconfig.save_routes(agency_id, routes, save_to_s3=save_to_s3)
+        routeconfig.new_save_routes(agency_id, routes, save_to_s3=save_to_s3)
diff --git a/backend/save_routes.py b/backend/save_routes.py
index fbfc13c5..6f163dc1 100644
--- a/backend/save_routes.py
+++ b/backend/save_routes.py
@@ -57,7 +57,7 @@
 
     for agency in agencies:
         scraper = gtfs.GtfsScraper(agency)
-        scraper.save_routes(save_to_s3, d)
+        scraper.save_new_routes(save_to_s3, d)
 
     if args.timetables:
         timetables_updated = scraper.save_timetables(save_to_s3=save_to_s3, skip_existing=True)

From 857fe04fc3ba1821ad468412a8e7cee84f080a01 Mon Sep 17 00:00:00 2001
From: Brian-Lee
Date: Wed, 22 Apr 2020 15:13:07 -0700
Subject: [PATCH 02/55] fixed naming error with routeconfig.new_save_routes

---
 backend/models/gtfs.py | 2 +-
 backend/save_routes.py | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/backend/models/gtfs.py b/backend/models/gtfs.py
index 1df479df..6e541fbd 100644
--- a/backend/models/gtfs.py
+++ b/backend/models/gtfs.py
@@ -1078,4 +1078,4 @@ def save_new_routes(self, save_to_s3, d):
 
         routes = [routeconfig.RouteConfig(agency_id, route_data) for route_data in routes_data]
 
-        routeconfig.new_save_routes(agency_id, routes, save_to_s3=save_to_s3)
+        routeconfig.save_routes(agency_id, routes, save_to_s3=save_to_s3)
diff --git a/backend/save_routes.py b/backend/save_routes.py
index 6f163dc1..ed97ad0d 100644
--- a/backend/save_routes.py
+++ b/backend/save_routes.py
@@ -52,6 +52,8 @@
     save_to_s3 = args.s3
 
     d = date.today()
+    import datetime
+    d = d + datetime.timedelta(days=3)
 
     errors = []

From c87b43e002741c435899d3f99f903d3ae67b9197 Mon Sep 17 00:00:00 2001
From: Brian-Lee
Date: Wed, 22 Apr 2020 15:45:40 -0700
Subject: [PATCH 03/55] added save_old_routes method

---
 backend/models/gtfs.py | 22 ++++++++++++++++++++++
 backend/save_routes.py | 2 ++
 2 files changed, 24 insertions(+)

diff --git a/backend/models/gtfs.py b/backend/models/gtfs.py
index 6e541fbd..5a83d2fa 100644
--- a/backend/models/gtfs.py
+++ b/backend/models/gtfs.py
@@ -1079,3 +1079,25 @@ def save_new_routes(self, save_to_s3, d):
         routes = [routeconfig.RouteConfig(agency_id, route_data) for route_data in routes_data]
 
         routeconfig.save_routes(agency_id, routes, save_to_s3=save_to_s3)
+
+    def save_old_routes(self, save_to_s3, d):
+        agency = self.agency
+        agency_id = agency.id
+        routes_df = self.get_gtfs_routes()
+        routes_df = self.get_active_routes(routes_df, d)
+        if len(routes_df) == 0:
+            self.errors.append((
+                f'Zero active routes for {agency_id}, the routes config was not updated. '
+                f'Ensure the GTFS is active for the given date {d}'
+            ))
+            return
+
+        routes_data = [
+            self.get_route_data(route)
+            for route in routes_df.itertuples()
+        ]
+        routes_data = self.sort_routes(routes_data)
+
+        routes = [routeconfig.RouteConfig(agency_id, route_data) for route_data in routes_data]
+
+        routeconfig.save_routes(agency_id, routes, save_to_s3=save_to_s3)
diff --git a/backend/save_routes.py b/backend/save_routes.py
index ed97ad0d..a9fd2ee3 100644
--- a/backend/save_routes.py
+++ b/backend/save_routes.py
@@ -52,6 +52,8 @@
     save_to_s3 = args.s3
 
     d = date.today()
+    # dont forget to take this out so the date is truly today
+    # this fake date is for testing
     import datetime
     d = d + datetime.timedelta(days=3)
 
     errors = []

From 3817c06beb3921f50dc2461e1311937fc6f7f82e Mon Sep 17 00:00:00 2001
From: Brian-Lee
Date: Wed, 22 Apr 2020 16:07:41 -0700
Subject: [PATCH 04/55] called save_old_routes after calling save_new_routes -
 doesnt save properly yet

---
 backend/models/gtfs.py | 2 +-
 backend/save_routes.py | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/backend/models/gtfs.py b/backend/models/gtfs.py
index 5a83d2fa..b21a182c 100644
--- a/backend/models/gtfs.py
+++ b/backend/models/gtfs.py
@@ -1100,4 +1100,4 @@ def save_old_routes(self, save_to_s3, d):
 
         routes = [routeconfig.RouteConfig(agency_id, route_data) for route_data in routes_data]
 
-        routeconfig.save_routes(agency_id, routes, save_to_s3=save_to_s3)
+        routeconfig.save_routes(agency_id, routes, save_to_s3=False)
diff --git a/backend/save_routes.py b/backend/save_routes.py
index a9fd2ee3..d8fe68f7 100644
--- a/backend/save_routes.py
+++ b/backend/save_routes.py
@@ -62,6 +62,7 @@
     for agency in agencies:
         scraper = gtfs.GtfsScraper(agency)
         scraper.save_new_routes(save_to_s3, d)
+        scraper.save_old_routes(False, d)
 
     if args.timetables:
         timetables_updated = scraper.save_timetables(save_to_s3=save_to_s3, skip_existing=True)
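Note: the patches that follow converge on a date-versioned cache layout for archived route configs. A minimal sketch of the naming scheme they end up with (the helper name and version string 'v3a' appear in routeconfig.py; the agency id, date, and data dir stand-in here are illustrative):

    # sketch of the dated cache path, assuming util.get_data_dir() returns the backend data dir
    def get_cache_path(agency_id, version='v3a', version_date=None):
        data_dir = 'data'  # stand-in for util.get_data_dir()
        if version_date is None:
            return f'{data_dir}/routes_{version}_{agency_id}.json'
        # archived copies are written inside a per-date directory
        return f'{data_dir}/routes_{version}_{agency_id}_{version_date}/routes_{version}_{agency_id}_{version_date}.json'

    print(get_cache_path('sfmta', version_date='2020-02-19'))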
@@ -1100,4 +1101,4 @@ def save_old_routes(self, save_to_s3, d): routes = [routeconfig.RouteConfig(agency_id, route_data) for route_data in routes_data] - routeconfig.save_routes(agency_id, routes, save_to_s3=False) + routeconfig.save_routes(agency_id, routes, save_to_s3=False, use_versioning=True, version_date=d) diff --git a/backend/models/routeconfig.py b/backend/models/routeconfig.py index df664c8d..7e8fa7ba 100644 --- a/backend/models/routeconfig.py +++ b/backend/models/routeconfig.py @@ -121,8 +121,17 @@ def get_directions_for_stop(self, stop_id): for s in direction['stops'] if s == stop_id ] -def get_cache_path(agency_id, version=DefaultVersion): - return f'{util.get_data_dir()}/routes_{version}_{agency_id}.json' +def get_cache_path(agency_id, version=DefaultVersion, use_versioning=False, version_date=None): + # use_versioning is for saving old versions of routes + # It has nothing to do with version=DefaultVersion + if version_date == None: + return f'{util.get_data_dir()}/routes_{version}_{agency_id}_notdated.json' + else: + return f'{util.get_data_dir()}/routes_{version}_{agency_id}_dated_{version_date}/routes_{version}_{agency_id}_dated_{version_date}.json' + + + ##bri##return f"{util.get_data_dir()}/datekeys_{version}_{agency_id}/datekeys_{version}_{agency_id}.json" + def get_s3_path(agency_id, version=DefaultVersion): return f'routes/{version}/routes_{version}_{agency_id}.json.gz' @@ -179,14 +188,27 @@ def get_route_config(agency_id, route_id, version=DefaultVersion): return route return None -def save_routes(agency_id, routes, save_to_s3=False): +def save_routes(agency_id, routes, save_to_s3=False, use_versioning=False, version_date=None): data_str = json.dumps({ 'version': DefaultVersion, 'routes': [route.data for route in routes] }, separators=(',', ':')) - cache_path = get_cache_path(agency_id) - + cache_path = get_cache_path(agency_id, use_versioning=use_versioning, version_date=version_date) + + print(use_versioning) + ##bri## + #if(use_versioning == True): + # cache_path = f'{util.get_data_dir()}/routes_{agency_id}_hey.json' + ##bri##exit()##bri## + #print(cache_path) + ##bri##exit() + + from pathlib import Path + cache_dir = Path(cache_path).parent + if not cache_dir.exists(): + cache_dir.mkdir(parents = True, exist_ok = True) + with open(cache_path, "w") as f: f.write(data_str) diff --git a/backend/save_routes.py b/backend/save_routes.py index d8fe68f7..5e52802b 100644 --- a/backend/save_routes.py +++ b/backend/save_routes.py @@ -61,8 +61,9 @@ for agency in agencies: scraper = gtfs.GtfsScraper(agency) - scraper.save_new_routes(save_to_s3, d) + scraper.save_old_routes(False, d) + scraper.save_new_routes(save_to_s3, d) if args.timetables: timetables_updated = scraper.save_timetables(save_to_s3=save_to_s3, skip_existing=True) From e56fa1e89bb3efb3d6e7c4370cec6c8182baee1e Mon Sep 17 00:00:00 2001 From: Brian-Lee Date: Wed, 22 Apr 2020 23:33:17 -0700 Subject: [PATCH 06/55] eliminated unecessary use_versioning variable --- backend/models/gtfs.py | 2 +- backend/models/routeconfig.py | 16 ++++------------ 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/backend/models/gtfs.py b/backend/models/gtfs.py index e677cdb5..b9539ac6 100644 --- a/backend/models/gtfs.py +++ b/backend/models/gtfs.py @@ -1101,4 +1101,4 @@ def save_old_routes(self, save_to_s3, d): routes = [routeconfig.RouteConfig(agency_id, route_data) for route_data in routes_data] - routeconfig.save_routes(agency_id, routes, save_to_s3=False, use_versioning=True, version_date=d) + 
routeconfig.save_routes(agency_id, routes, save_to_s3=False, version_date=d) diff --git a/backend/models/routeconfig.py b/backend/models/routeconfig.py index 7e8fa7ba..6f3eec9b 100644 --- a/backend/models/routeconfig.py +++ b/backend/models/routeconfig.py @@ -121,8 +121,8 @@ def get_directions_for_stop(self, stop_id): for s in direction['stops'] if s == stop_id ] -def get_cache_path(agency_id, version=DefaultVersion, use_versioning=False, version_date=None): - # use_versioning is for saving old versions of routes +def get_cache_path(agency_id, version=DefaultVersion, version_date=None): + # version_date is for saving old versions of routes # It has nothing to do with version=DefaultVersion if version_date == None: return f'{util.get_data_dir()}/routes_{version}_{agency_id}_notdated.json' @@ -188,21 +188,13 @@ def get_route_config(agency_id, route_id, version=DefaultVersion): return route return None -def save_routes(agency_id, routes, save_to_s3=False, use_versioning=False, version_date=None): +def save_routes(agency_id, routes, save_to_s3=False, version_date=None): data_str = json.dumps({ 'version': DefaultVersion, 'routes': [route.data for route in routes] }, separators=(',', ':')) - cache_path = get_cache_path(agency_id, use_versioning=use_versioning, version_date=version_date) - - print(use_versioning) - ##bri## - #if(use_versioning == True): - # cache_path = f'{util.get_data_dir()}/routes_{agency_id}_hey.json' - ##bri##exit()##bri## - #print(cache_path) - ##bri##exit() + cache_path = get_cache_path(agency_id, version_date=version_date) from pathlib import Path cache_dir = Path(cache_path).parent From 28b1931f0e1f9c4827d047c9a68d3a6fdb2d0d8d Mon Sep 17 00:00:00 2001 From: Brian-Lee Date: Thu, 23 Apr 2020 02:11:32 -0700 Subject: [PATCH 07/55] consolidated save_old_routes and save_new_routes into save_routes --- backend/models/gtfs.py | 60 ++++++++++++++++++++++++------------------ backend/save_routes.py | 18 ++++++++++--- 2 files changed, 50 insertions(+), 28 deletions(-) diff --git a/backend/models/gtfs.py b/backend/models/gtfs.py index b9539ac6..82ca2698 100644 --- a/backend/models/gtfs.py +++ b/backend/models/gtfs.py @@ -70,6 +70,37 @@ def download_gtfs_data(agency: config.Agency, gtfs_cache_dir): with zipfile.ZipFile(zip_path, 'r') as zip_ref: zip_ref.extractall(gtfs_cache_dir) + + + + + +def download_old_gtfs_data(agency: config.Agency, gtfs_cache_dir): + ''' + get an old GFTS file from 2020-02-19 + https://transitfeeds.com/p/sfmta/60/20200219/download + ''' + #gtfs_url = agency.gtfs_url + gtfs_url = "https://transitfeeds.com/p/sfmta/60/20200219/download" + if gtfs_url is None: + raise Exception(f'agency {agency.id} does not have gtfs_url in config') + + cache_dir = Path(gtfs_cache_dir) + if not cache_dir.exists(): + print(f'downloading gtfs data from {gtfs_url}') + r = requests.get(gtfs_url) + + if r.status_code != 200: + raise Exception(f"Error fetching {gtfs_url}: HTTP {r.status_code}: {r.text}") + + zip_path = f'{util.get_data_dir()}/gtfs-{agency.id}.zip' + + with open(zip_path, 'wb') as f: + f.write(r.content) + + with zipfile.ZipFile(zip_path, 'r') as zip_ref: + zip_ref.extractall(gtfs_cache_dir) + def is_subsequence(smaller, bigger): smaller_len = len(smaller) @@ -113,7 +144,8 @@ def __init__(self, agency: config.Agency): self.agency_id = agency_id = agency.id gtfs_cache_dir = f'{util.get_data_dir()}/gtfs-{agency_id}' - download_gtfs_data(agency, gtfs_cache_dir) + #download_gtfs_data(agency, gtfs_cache_dir) + download_old_gtfs_data(agency, gtfs_cache_dir) self.feed = 
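Note: because the dated cache path introduced in patch 05 lives in a subdirectory that may not exist yet, the save step creates the parent directory before writing, and later patches keep that idiom. Shown standalone (the path is illustrative):

    from pathlib import Path

    cache_path = 'data/routes_v3a_sfmta_2020-02-19/routes_v3a_sfmta_2020-02-19.json'
    # create the enclosing directory if needed, before open(cache_path, 'w')
    Path(cache_path).parent.mkdir(parents=True, exist_ok=True)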
From 28b1931f0e1f9c4827d047c9a68d3a6fdb2d0d8d Mon Sep 17 00:00:00 2001
From: Brian-Lee
Date: Thu, 23 Apr 2020 02:11:32 -0700
Subject: [PATCH 07/55] consolidated save_old_routes and save_new_routes into
 save_routes

---
 backend/models/gtfs.py | 60 ++++++++++++++++++++++------------------
 backend/save_routes.py | 18 ++++++++++---
 2 files changed, 50 insertions(+), 28 deletions(-)

diff --git a/backend/models/gtfs.py b/backend/models/gtfs.py
index b9539ac6..82ca2698 100644
--- a/backend/models/gtfs.py
+++ b/backend/models/gtfs.py
@@ -70,6 +70,37 @@ def download_gtfs_data(agency: config.Agency, gtfs_cache_dir):
 
     with zipfile.ZipFile(zip_path, 'r') as zip_ref:
         zip_ref.extractall(gtfs_cache_dir)
 
+
+
+
+
+def download_old_gtfs_data(agency: config.Agency, gtfs_cache_dir):
+    '''
+    get an old GFTS file from 2020-02-19
+    https://transitfeeds.com/p/sfmta/60/20200219/download
+    '''
+    #gtfs_url = agency.gtfs_url
+    gtfs_url = "https://transitfeeds.com/p/sfmta/60/20200219/download"
+    if gtfs_url is None:
+        raise Exception(f'agency {agency.id} does not have gtfs_url in config')
+
+    cache_dir = Path(gtfs_cache_dir)
+    if not cache_dir.exists():
+        print(f'downloading gtfs data from {gtfs_url}')
+        r = requests.get(gtfs_url)
+
+        if r.status_code != 200:
+            raise Exception(f"Error fetching {gtfs_url}: HTTP {r.status_code}: {r.text}")
+
+        zip_path = f'{util.get_data_dir()}/gtfs-{agency.id}.zip'
+
+        with open(zip_path, 'wb') as f:
+            f.write(r.content)
+
+        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+            zip_ref.extractall(gtfs_cache_dir)
+
 def is_subsequence(smaller, bigger):
     smaller_len = len(smaller)
@@ -113,7 +144,8 @@ def __init__(self, agency: config.Agency):
         self.agency_id = agency_id = agency.id
         gtfs_cache_dir = f'{util.get_data_dir()}/gtfs-{agency_id}'
 
-        download_gtfs_data(agency, gtfs_cache_dir)
+        #download_gtfs_data(agency, gtfs_cache_dir)
+        download_old_gtfs_data(agency, gtfs_cache_dir)
 
         self.feed = ptg.load_geo_feed(gtfs_cache_dir, {})
@@ -1059,29 +1091,7 @@ def get_sort_key(route_data):
             return route_data['title']
 
         return sorted(routes_data, key=get_sort_key)
 
-    def save_new_routes(self, save_to_s3, d):
-        agency = self.agency
-        agency_id = agency.id
-        routes_df = self.get_gtfs_routes()
-        routes_df = self.get_active_routes(routes_df, d)
-        if len(routes_df) == 0:
-            self.errors.append((
-                f'Zero active routes for {agency_id}, the routes config was not updated. '
-                f'Ensure the GTFS is active for the given date {d}'
-            ))
-            return
-
-        routes_data = [
-            self.get_route_data(route)
-            for route in routes_df.itertuples()
-        ]
-        routes_data = self.sort_routes(routes_data)
-
-        routes = [routeconfig.RouteConfig(agency_id, route_data) for route_data in routes_data]
-
-        routeconfig.save_routes(agency_id, routes, save_to_s3=save_to_s3)
-
-    def save_old_routes(self, save_to_s3, d):
+    def save_routes(self, save_to_s3, d, version_date=None):
         agency = self.agency
         agency_id = agency.id
         routes_df = self.get_gtfs_routes()
@@ -1101,4 +1111,4 @@ def save_old_routes(self, save_to_s3, d):
 
         routes = [routeconfig.RouteConfig(agency_id, route_data) for route_data in routes_data]
 
-        routeconfig.save_routes(agency_id, routes, save_to_s3=False, version_date=d)
+        routeconfig.save_routes(agency_id, routes, save_to_s3=save_to_s3, version_date=version_date)
diff --git a/backend/save_routes.py b/backend/save_routes.py
index 5e52802b..49d72f71 100644
--- a/backend/save_routes.py
+++ b/backend/save_routes.py
@@ -52,18 +52,30 @@
     save_to_s3 = args.s3
 
     d = date.today()
+    ##bri##
     # dont forget to take this out so the date is truly today
     # this fake date is for testing
     import datetime
-    d = d + datetime.timedelta(days=3)
+    #d = d + datetime.timedelta(days=3)
 
     errors = []
 
     for agency in agencies:
-        scraper = gtfs.GtfsScraper(agency)
-        scraper.save_old_routes(False, d)
-        scraper.save_new_routes(save_to_s3, d)
+        scraper = gtfs.GtfsScraper(agency)
+        scraper.save_routes(save_to_s3, d)
+
+        '''
+        use https://transitfeeds.com/api/swagger/
+        to get old routes
+        and cache them in date versioned folders
+        '''
+        ##bri## set save_to_s3 to False for archived routes
+        ##bri## figure out what date to really put in for d here
+        #scraper.save_old_routes(False, d)
+        scraper.save_routes(False, d, version_date=d)
 
     if args.timetables:
         timetables_updated = scraper.save_timetables(save_to_s3=save_to_s3, skip_existing=True)

From cb558569d3ed9e6567464642b372ba89e360b664 Mon Sep 17 00:00:00 2001
From: Brian-Lee
Date: Thu, 23 Apr 2020 03:24:08 -0700
Subject: [PATCH 08/55] framework for executing a scrape saving routes
 normally followed by a scrape archiving routes

---
 backend/models/gtfs.py | 33 +++++++++++++++++++++++++++------
 backend/save_routes.py | 12 ++++++------
 2 files changed, 33 insertions(+), 12 deletions(-)

diff --git a/backend/models/gtfs.py b/backend/models/gtfs.py
index 82ca2698..fb3aca14 100644
--- a/backend/models/gtfs.py
+++ b/backend/models/gtfs.py
@@ -50,12 +50,32 @@ def get_stop_geometry(stop_xy, shape_lines_xy, shape_cumulative_dist, start_inde
         'offset': int(best_offset) # distance in meters between this stop and the closest line segment of shape
     }
 
-def download_gtfs_data(agency: config.Agency, gtfs_cache_dir):
-    gtfs_url = agency.gtfs_url
+def download_gtfs_data(agency: config.Agency, gtfs_cache_dir, archiving_old=False):
+    cache_dir = Path(gtfs_cache_dir)
+    zip_path = f'{util.get_data_dir()}/gtfs-{agency.id}.zip'
+    import os
+    if archiving_old == False:
+        gtfs_url = agency.gtfs_url
+    else:
+        '''
+        get an old GFTS file from 2020-02-19
+        https://transitfeeds.com/p/sfmta/60/20200219/download
+        '''
+        gtfs_url = "https://transitfeeds.com/p/sfmta/60/20200219/download"
+        # need to delete existing zip file and directory in order
+        # to reuse for the archiving passes
+
+        if cache_dir.exists():
+            import shutil
+            shutil.rmtree(cache_dir)
+            print('removed',cache_dir)
+            os.remove(zip_path)
+            print('removed',zip_path)
+
     if gtfs_url is None:
         raise Exception(f'agency {agency.id} does not have gtfs_url in config')
 
-    cache_dir = Path(gtfs_cache_dir)
+
     if not cache_dir.exists():
         print(f'downloading gtfs data from {gtfs_url}')
         r = requests.get(gtfs_url)
@@ -63,6 +83,7 @@ def download_gtfs_data(agency: config.Agency, gtfs_cache_dir, archiving_old=Fals
         if r.status_code != 200:
             raise Exception(f"Error fetching {gtfs_url}: HTTP {r.status_code}: {r.text}")
 
+        ##bri## should not redefine
         zip_path = f'{util.get_data_dir()}/gtfs-{agency.id}.zip'
 
         with open(zip_path, 'wb') as f:
@@ -139,13 +160,13 @@ def contains_excluded_stop(shape_stop_ids, excluded_stop_ids):
     return False
 
 class GtfsScraper:
-    def __init__(self, agency: config.Agency):
+    def __init__(self, agency: config.Agency, archiving_old=False):
         self.agency = agency
         self.agency_id = agency_id = agency.id
         gtfs_cache_dir = f'{util.get_data_dir()}/gtfs-{agency_id}'
 
-        #download_gtfs_data(agency, gtfs_cache_dir)
-        download_old_gtfs_data(agency, gtfs_cache_dir)
+        download_gtfs_data(agency, gtfs_cache_dir, archiving_old=archiving_old)
+        #download_old_gtfs_data(agency, gtfs_cache_dir)
 
         self.feed = ptg.load_geo_feed(gtfs_cache_dir, {})
diff --git a/backend/save_routes.py b/backend/save_routes.py
index 49d72f71..7646d556 100644
--- a/backend/save_routes.py
+++ b/backend/save_routes.py
@@ -56,14 +56,12 @@
     # dont forget to take this out so the date is truly today
     # this fake date is for testing
     import datetime
-    #d = d + datetime.timedelta(days=3)
+    d = d + datetime.timedelta(days=3)
 
     errors = []
 
     for agency in agencies:
-        scraper = gtfs.GtfsScraper(agency)
-
-
+        scraper = gtfs.GtfsScraper(agency, archiving_old=False)
         scraper.save_routes(save_to_s3, d)
 
         '''
@@ -73,8 +71,9 @@
         '''
         ##bri## set save_to_s3 to False for archived routes
         ##bri## figure out what date to really put in for d here
         #scraper.save_old_routes(False, d)
-        scraper.save_routes(False, d, version_date=d)
+        scraper_archiving = gtfs.GtfsScraper(agency, archiving_old=True)
+        scraper_archiving.save_routes(False, d, version_date=d)
 
     if args.timetables:
@@ -85,6 +84,7 @@
         compute_stats_for_dates(dates, agency, scheduled=True, save_to_s3=save_to_s3)
 
     errors += scraper.errors
+    errors += scraper_archiving.errors
 
 if errors:
     raise Exception("\n".join(errors))
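Note: the download-and-extract flow that patch 08 branches on is the same in both passes; only the URL differs. A condensed, hedged sketch of that flow (error handling trimmed; the URL is the hard-coded archive example from the patch, and the local filenames are illustrative):

    import requests, zipfile

    gtfs_url = 'https://transitfeeds.com/p/sfmta/60/20200219/download'  # example archive URL from the patch
    zip_path = 'gtfs-sfmta.zip'
    r = requests.get(gtfs_url)
    r.raise_for_status()  # the patch raises its own Exception on non-200 instead
    with open(zip_path, 'wb') as f:
        f.write(r.content)
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall('gtfs-sfmta')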
From 33a8e53581177509cf7c2520d81818bd29e2ddaf Mon Sep 17 00:00:00 2001
From: Brian-Lee
Date: Thu, 23 Apr 2020 03:31:21 -0700
Subject: [PATCH 09/55] removed 'notdated' from non-archived routes JSON files

---
 backend/models/routeconfig.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/models/routeconfig.py b/backend/models/routeconfig.py
index 6f3eec9b..e6cd96dc 100644
--- a/backend/models/routeconfig.py
+++ b/backend/models/routeconfig.py
@@ -125,7 +125,7 @@ def get_cache_path(agency_id, version=DefaultVersion, version_date=None):
     # version_date is for saving old versions of routes
     # It has nothing to do with version=DefaultVersion
     if version_date == None:
-        return f'{util.get_data_dir()}/routes_{version}_{agency_id}_notdated.json'
+        return f'{util.get_data_dir()}/routes_{version}_{agency_id}.json'
     else:
         return f'{util.get_data_dir()}/routes_{version}_{agency_id}_dated_{version_date}/routes_{version}_{agency_id}_dated_{version_date}.json'

From 690e0cc9be38a11050afaf5dd3fcd801dda8f763 Mon Sep 17 00:00:00 2001
From: Brian-Lee
Date: Thu, 23 Apr 2020 03:42:07 -0700
Subject: [PATCH 10/55] removed unused method download_gtfs_data and 'dated'
 from filenames

---
 backend/models/gtfs.py | 29 -----------------------------
 backend/models/routeconfig.py | 2 +-
 2 files changed, 1 insertion(+), 30 deletions(-)

diff --git a/backend/models/gtfs.py b/backend/models/gtfs.py
index fb3aca14..20bd443e 100644
--- a/backend/models/gtfs.py
+++ b/backend/models/gtfs.py
@@ -94,35 +94,6 @@ def download_gtfs_data(agency: config.Agency, gtfs_cache_dir, archiving_old=Fals
 
 
-
-
-def download_old_gtfs_data(agency: config.Agency, gtfs_cache_dir):
-    '''
-    get an old GFTS file from 2020-02-19
-    https://transitfeeds.com/p/sfmta/60/20200219/download
-    '''
-    #gtfs_url = agency.gtfs_url
-    gtfs_url = "https://transitfeeds.com/p/sfmta/60/20200219/download"
-    if gtfs_url is None:
-        raise Exception(f'agency {agency.id} does not have gtfs_url in config')
-
-    cache_dir = Path(gtfs_cache_dir)
-    if not cache_dir.exists():
-        print(f'downloading gtfs data from {gtfs_url}')
-        r = requests.get(gtfs_url)
-
-        if r.status_code != 200:
-            raise Exception(f"Error fetching {gtfs_url}: HTTP {r.status_code}: {r.text}")
-
-        zip_path = f'{util.get_data_dir()}/gtfs-{agency.id}.zip'
-
-        with open(zip_path, 'wb') as f:
-            f.write(r.content)
-
-        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
-            zip_ref.extractall(gtfs_cache_dir)
-
-
 def is_subsequence(smaller, bigger):
     smaller_len = len(smaller)
     bigger_len = len(bigger)
diff --git a/backend/models/routeconfig.py b/backend/models/routeconfig.py
index e6cd96dc..4284fcbc 100644
--- a/backend/models/routeconfig.py
+++ b/backend/models/routeconfig.py
@@ -127,7 +127,7 @@ def get_cache_path(agency_id, version=DefaultVersion, version_date=None):
     if version_date == None:
         return f'{util.get_data_dir()}/routes_{version}_{agency_id}.json'
     else:
-        return f'{util.get_data_dir()}/routes_{version}_{agency_id}_dated_{version_date}/routes_{version}_{agency_id}_dated_{version_date}.json'
+        return f'{util.get_data_dir()}/routes_{version}_{agency_id}_{version_date}/routes_{version}_{agency_id}_{version_date}.json'
 
 
     ##bri##return f"{util.get_data_dir()}/datekeys_{version}_{agency_id}/datekeys_{version}_{agency_id}.json"

From 0c9fba64669bb53f09d68371a1658e91d90344cb Mon Sep 17 00:00:00 2001
From: Brian-Lee
Date: Thu, 23 Apr 2020 03:50:03 -0700
Subject: [PATCH 11/55] put in a more realistic date for the archive date
 version for the single archived route

---
 backend/save_routes.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/backend/save_routes.py b/backend/save_routes.py
index 7646d556..811edd4f 100644
--- a/backend/save_routes.py
+++ b/backend/save_routes.py
@@ -71,9 +71,10 @@
         '''
         ##bri## set save_to_s3 to False for archived routes
         ##bri## figure out what date to really put in for d here
-        #scraper.save_old_routes(False, d)
         scraper_archiving = gtfs.GtfsScraper(agency, archiving_old=True)
-        scraper_archiving.save_routes(False, d, version_date=d)
+        #scraper_archiving.save_routes(False, d, version_date=d)
+        scraper_archiving.save_routes(False, d, version_date='2020-02-19')
+
 
     if args.timetables:

From 26f17727c1aa616b717605f36f2a453caba6aaa8 Mon Sep 17 00:00:00 2001
From: Brian-Lee
Date: Thu, 23 Apr 2020 03:57:50 -0700
Subject: [PATCH 12/55] moved imports to the top

---
 backend/models/gtfs.py | 4 ++--
 backend/models/routeconfig.py | 3 +--
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/backend/models/gtfs.py b/backend/models/gtfs.py
index 20bd443e..debd21ff 100644
--- a/backend/models/gtfs.py
+++ b/backend/models/gtfs.py
@@ -8,6 +8,8 @@
 import gzip
 import hashlib
 import zipfile
+import shutil
+import os
 
 from . import config, util, nextbus, routeconfig, timetables
@@ -53,7 +55,6 @@ def download_gtfs_data(agency: config.Agency, gtfs_cache_dir, archiving_old=Fals
     cache_dir = Path(gtfs_cache_dir)
     zip_path = f'{util.get_data_dir()}/gtfs-{agency.id}.zip'
-    import os
     if archiving_old == False:
         gtfs_url = agency.gtfs_url
     else:
@@ -66,7 +67,6 @@ def download_gtfs_data(agency: config.Agency, gtfs_cache_dir, archiving_old=Fals
         # to reuse for the archiving passes
 
         if cache_dir.exists():
-            import shutil
             shutil.rmtree(cache_dir)
             print('removed',cache_dir)
             os.remove(zip_path)
diff --git a/backend/models/routeconfig.py b/backend/models/routeconfig.py
index 4284fcbc..568948e2 100644
--- a/backend/models/routeconfig.py
+++ b/backend/models/routeconfig.py
@@ -1,4 +1,5 @@
 import re, os, time, requests, json, boto3, gzip
+from pathlib import Path
 from . import util, config
 
 DefaultVersion = 'v3a'
@@ -195,8 +196,6 @@ def save_routes(agency_id, routes, save_to_s3=False, version_date=None):
    }, separators=(',', ':'))
 
     cache_path = get_cache_path(agency_id, version_date=version_date)
-
-    from pathlib import Path
     cache_dir = Path(cache_path).parent
     if not cache_dir.exists():
         cache_dir.mkdir(parents = True, exist_ok = True)

From 99982993f768a64dac1b62f816ad7fdbf0c56f0b Mon Sep 17 00:00:00 2001
From: Brian-Lee
Date: Thu, 23 Apr 2020 04:01:20 -0700
Subject: [PATCH 13/55] added reminder comment to properly get archived GTFS
 data

---
 backend/models/gtfs.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/backend/models/gtfs.py b/backend/models/gtfs.py
index debd21ff..24fb8b98 100644
--- a/backend/models/gtfs.py
+++ b/backend/models/gtfs.py
@@ -59,6 +59,8 @@ def download_gtfs_data(agency: config.Agency, gtfs_cache_dir, archiving_old=Fals
         gtfs_url = agency.gtfs_url
     else:
         '''
+        need to set up a system for properly getting URLs for archiving routes
+
         get an old GFTS file from 2020-02-19
         https://transitfeeds.com/p/sfmta/60/20200219/download
         '''
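Note: the transitfeeds.com download URLs used in the patches below embed the feed date as YYYYMMDD, which is what later lets patch 16 derive a URL from an ISO date string. The transformation, shown standalone:

    archiving_date = '2020-02-19'
    archiving_url = 'https://transitfeeds.com/p/sfmta/60/' + archiving_date.replace('-', '') + '/download'
    print(archiving_url)  # -> https://transitfeeds.com/p/sfmta/60/20200219/download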
= "https://transitfeeds.com/p/sfmta/60/20200219/download" + #####gtfs_url = "https://transitfeeds.com/p/sfmta/60/20200219/download" + gtfs_url = archiving_url # need to delete existing zip file and directory in order # to reuse for the archiving passes @@ -85,9 +86,6 @@ def download_gtfs_data(agency: config.Agency, gtfs_cache_dir, archiving_old=Fals if r.status_code != 200: raise Exception(f"Error fetching {gtfs_url}: HTTP {r.status_code}: {r.text}") - ##bri## should not redefine - zip_path = f'{util.get_data_dir()}/gtfs-{agency.id}.zip' - with open(zip_path, 'wb') as f: f.write(r.content) @@ -133,12 +131,12 @@ def contains_excluded_stop(shape_stop_ids, excluded_stop_ids): return False class GtfsScraper: - def __init__(self, agency: config.Agency, archiving_old=False): + def __init__(self, agency: config.Agency, archiving_old=False, archiving_url=None): self.agency = agency self.agency_id = agency_id = agency.id gtfs_cache_dir = f'{util.get_data_dir()}/gtfs-{agency_id}' - download_gtfs_data(agency, gtfs_cache_dir, archiving_old=archiving_old) + download_gtfs_data(agency, gtfs_cache_dir, archiving_old=archiving_old, archiving_url=archiving_url) #download_old_gtfs_data(agency, gtfs_cache_dir) self.feed = ptg.load_geo_feed(gtfs_cache_dir, {}) diff --git a/backend/save_routes.py b/backend/save_routes.py index 811edd4f..52dd3a68 100644 --- a/backend/save_routes.py +++ b/backend/save_routes.py @@ -63,7 +63,7 @@ for agency in agencies: scraper = gtfs.GtfsScraper(agency, archiving_old=False) scraper.save_routes(save_to_s3, d) - + errors += scraper.errors ''' use https://transitfeeds.com/api/swagger/ to get old routes @@ -71,9 +71,17 @@ ''' ##bri## set save_to_s3 to False for archived routes ##bri## figure out what date to really put in for d here - scraper_archiving = gtfs.GtfsScraper(agency, archiving_old=True) + archiving_url = 'https://transitfeeds.com/p/sfmta/60/20200219/download' + scraper_archiving = gtfs.GtfsScraper(agency, archiving_old=True, archiving_url=archiving_url) #scraper_archiving.save_routes(False, d, version_date=d) scraper_archiving.save_routes(False, d, version_date='2020-02-19') + errors += scraper_archiving.errors + + archiving_url = 'https://transitfeeds.com/p/sfmta/60/20200409/download' + scraper_archiving = gtfs.GtfsScraper(agency, archiving_old=True, archiving_url=archiving_url) + scraper_archiving.save_routes(False, d, version_date='2020-04-09') + errors += scraper_archiving.errors + @@ -84,8 +92,8 @@ dates = sorted(scraper.get_services_by_date().keys()) compute_stats_for_dates(dates, agency, scheduled=True, save_to_s3=save_to_s3) - errors += scraper.errors - errors += scraper_archiving.errors + + if errors: raise Exception("\n".join(errors)) From 50812c4ab18291e9bdb366887c403dbc3832425d Mon Sep 17 00:00:00 2001 From: Brian-Lee Date: Thu, 23 Apr 2020 05:00:20 -0700 Subject: [PATCH 15/55] pulling archive urls from a list --- backend/save_routes.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/backend/save_routes.py b/backend/save_routes.py index 52dd3a68..01371eb6 100644 --- a/backend/save_routes.py +++ b/backend/save_routes.py @@ -69,15 +69,19 @@ to get old routes and cache them in date versioned folders ''' + + archiving_urls = [] + archiving_urls.append('https://transitfeeds.com/p/sfmta/60/20200219/download') + archiving_urls.append('https://transitfeeds.com/p/sfmta/60/20200409/download') ##bri## set save_to_s3 to False for archived routes ##bri## figure out what date to really put in for d here - archiving_url = 
'https://transitfeeds.com/p/sfmta/60/20200219/download' + archiving_url = archiving_urls.pop() scraper_archiving = gtfs.GtfsScraper(agency, archiving_old=True, archiving_url=archiving_url) #scraper_archiving.save_routes(False, d, version_date=d) scraper_archiving.save_routes(False, d, version_date='2020-02-19') errors += scraper_archiving.errors - archiving_url = 'https://transitfeeds.com/p/sfmta/60/20200409/download' + archiving_url = archiving_urls.pop() scraper_archiving = gtfs.GtfsScraper(agency, archiving_old=True, archiving_url=archiving_url) scraper_archiving.save_routes(False, d, version_date='2020-04-09') errors += scraper_archiving.errors From 5f7112fcd27b49b55867cc59043393013cb04445 Mon Sep 17 00:00:00 2001 From: Brian-Lee Date: Thu, 23 Apr 2020 05:30:54 -0700 Subject: [PATCH 16/55] make url from date and loop through archiving urls for archiving routes --- backend/save_routes.py | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/backend/save_routes.py b/backend/save_routes.py index 01371eb6..679a92e8 100644 --- a/backend/save_routes.py +++ b/backend/save_routes.py @@ -70,23 +70,27 @@ and cache them in date versioned folders ''' - archiving_urls = [] - archiving_urls.append('https://transitfeeds.com/p/sfmta/60/20200219/download') - archiving_urls.append('https://transitfeeds.com/p/sfmta/60/20200409/download') - ##bri## set save_to_s3 to False for archived routes - ##bri## figure out what date to really put in for d here - archiving_url = archiving_urls.pop() - scraper_archiving = gtfs.GtfsScraper(agency, archiving_old=True, archiving_url=archiving_url) - #scraper_archiving.save_routes(False, d, version_date=d) - scraper_archiving.save_routes(False, d, version_date='2020-02-19') - errors += scraper_archiving.errors + archiving_dates = [] + archiving_dates.append('2020-02-19') + archiving_dates.append('2020-04-09') + archiving_dates.append('2020-01-26') + archiving_dates.append('2020-03-28') - archiving_url = archiving_urls.pop() - scraper_archiving = gtfs.GtfsScraper(agency, archiving_old=True, archiving_url=archiving_url) - scraper_archiving.save_routes(False, d, version_date='2020-04-09') - errors += scraper_archiving.errors + #archiving_urls = [] + #archiving_urls.append('https://transitfeeds.com/p/sfmta/60/20200219/download') + #archiving_urls.append('https://transitfeeds.com/p/sfmta/60/20200409/download') + ##bri## set save_to_s3 to False for archived routes + ##bri## figure out what date to really put in for d here + while(len(archiving_dates) > 0): + archiving_date = archiving_dates.pop() + archiving_url = 'https://transitfeeds.com/p/sfmta/60/' + archiving_date.replace("-","") + '/download' + scraper_archiving = gtfs.GtfsScraper(agency, archiving_old=True, archiving_url=archiving_url) + scraper_archiving.save_routes(False, d, version_date=archiving_date) + errors += scraper_archiving.errors + + if args.timetables: From 652a45920dc02ed7aa9a8279dce4d02f0fbcb50d Mon Sep 17 00:00:00 2001 From: Brian-Lee Date: Fri, 24 Apr 2020 02:27:38 -0700 Subject: [PATCH 17/55] use transitfeeds api to get old routes to version by date and cache - need to supply own api key --- backend/models/gtfs.py | 1 + backend/save_routes.py | 42 +++++++++++++++++++----------------------- 2 files changed, 20 insertions(+), 23 deletions(-) diff --git a/backend/models/gtfs.py b/backend/models/gtfs.py index 3091d714..6619cbea 100644 --- a/backend/models/gtfs.py +++ b/backend/models/gtfs.py @@ -70,6 +70,7 @@ def download_gtfs_data(agency: config.Agency, 
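Note: patch 17 below replaces the hand-kept date list with the transitfeeds getFeedVersions API. A sketch of the date extraction it performs on each returned URL, shown offline with a sample URL of the shape the patch expects at split index 6:

    url = 'https://transitfeeds.com/p/sfmta/60/20200219/download'  # sample of an API result URL
    raw = url.split('/')[6]                         # '20200219'
    iso = raw[:4] + '-' + raw[4:6] + '-' + raw[6:]  # '2020-02-19'
    print(iso)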
From 652a45920dc02ed7aa9a8279dce4d02f0fbcb50d Mon Sep 17 00:00:00 2001
From: Brian-Lee
Date: Fri, 24 Apr 2020 02:27:38 -0700
Subject: [PATCH 17/55] use transitfeeds api to get old routes to version by
 date and cache - need to supply own api key

---
 backend/models/gtfs.py | 1 +
 backend/save_routes.py | 42 +++++++++++++++++++-----------------------
 2 files changed, 20 insertions(+), 23 deletions(-)

diff --git a/backend/models/gtfs.py b/backend/models/gtfs.py
index 3091d714..6619cbea 100644
--- a/backend/models/gtfs.py
+++ b/backend/models/gtfs.py
@@ -70,6 +70,7 @@ def download_gtfs_data(agency: config.Agency, gtfs_cache_dir, archiving_old=Fals
         # to reuse for the archiving passes
 
         if cache_dir.exists():
+            #exit()
             shutil.rmtree(cache_dir)
             print('removed',cache_dir)
             os.remove(zip_path)
             print('removed',zip_path)
diff --git a/backend/save_routes.py b/backend/save_routes.py
index 679a92e8..6a8bd7e0 100644
--- a/backend/save_routes.py
+++ b/backend/save_routes.py
@@ -2,6 +2,8 @@
 from compute_stats import compute_stats_for_dates
 import argparse
 from datetime import date
+import requests
+from secrets import transitfeeds_api_key
 
 # Downloads and parses the GTFS specification
 # and saves the configuration for all routes to S3.
@@ -34,7 +36,7 @@
 #
 # Currently the script just overwrites the one S3 path, but this process could be extended in the future to
 # store different paths for different dates, to allow fetching historical data for route configurations.
-#
+# UPDATE: We are now saving some older routes in versioned directories in metrics-mvp/backend/data
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='Save route configuration from GTFS and possibly Nextbus API')
@@ -52,14 +54,21 @@
     save_to_s3 = args.s3
 
     d = date.today()
-    ##bri##
-    # dont forget to take this out so the date is truly today
-    # this fake date is for testing
-    import datetime
-    d = d + datetime.timedelta(days=3)
-
+    archive_date = date.today()
     errors = []
 
+    limit = '10'
+    urls_feed = 'https://api.transitfeeds.com/v1/getFeedVersions?key=' + transitfeeds_api_key + '&feed=sfmta%2F60&page=1&limit=' + limit + '&err=1&warn=1'
+
+    response = requests.get(urls_feed)
+    data = response.json()
+    archiving_urls = []
+    archiving_dates = []
+    for i in range(len(data['results']['versions'])):
+        archiving_urls.append(data['results']['versions'][i]['url'])
+        archiving_dates.append(archiving_urls[i].split('/')[6])
+        archiving_dates[i] = archiving_dates[i][:4]+'-'+archiving_dates[i][4:6]+'-'+archiving_dates[i][6:]
+
     for agency in agencies:
         scraper = gtfs.GtfsScraper(agency, archiving_old=False)
         scraper.save_routes(save_to_s3, d)
         errors += scraper.errors
         '''
         use https://transitfeeds.com/api/swagger/
         to get old routes
         and cache them in date versioned folders
 
-        '''
-        archiving_dates = []
-        archiving_dates.append('2020-02-19')
-        archiving_dates.append('2020-04-09')
-        archiving_dates.append('2020-01-26')
-        archiving_dates.append('2020-03-28')
-
-        #archiving_urls = []
-        #archiving_urls.append('https://transitfeeds.com/p/sfmta/60/20200219/download')
-        #archiving_urls.append('https://transitfeeds.com/p/sfmta/60/20200409/download')
-        ##bri## set save_to_s3 to False for archived routes
-        ##bri## figure out what date to really put in for d here
+        '''
+
         while(len(archiving_dates) > 0):
             archiving_date = archiving_dates.pop()
-            archiving_url = 'https://transitfeeds.com/p/sfmta/60/' + archiving_date.replace("-","") + '/download'
+            archiving_url = archiving_urls.pop()
             scraper_archiving = gtfs.GtfsScraper(agency, archiving_old=True, archiving_url=archiving_url)
             scraper_archiving.save_routes(False, d, version_date=archiving_date)
             errors += scraper_archiving.errors
 
     if args.timetables:

From 36565da6e3c621d25c048757a8b15cfeb177c02a Mon Sep 17 00:00:00 2001
From: Brian-Lee
Date: Fri, 24 Apr 2020 02:46:08 -0700
Subject: [PATCH 18/55] eliminate unecessary param archiving_old and other
 cleanup

---
 backend/models/gtfs.py | 23 ++++++-----------------
 backend/save_routes.py | 4 ++--
 2 files changed, 8 insertions(+), 19 deletions(-)

diff --git a/backend/models/gtfs.py b/backend/models/gtfs.py
index 6619cbea..55ba1e45 100644
--- a/backend/models/gtfs.py
+++ b/backend/models/gtfs.py
@@ -52,25 +52,17 @@ def get_stop_geometry(stop_xy, shape_lines_xy, shape_cumulative_dist, start_inde
         'offset': int(best_offset) # distance in meters between this stop and the closest line segment of shape
     }
 
-def download_gtfs_data(agency: config.Agency, gtfs_cache_dir, archiving_old=False, archiving_url=None):
+def download_gtfs_data(agency: config.Agency, gtfs_cache_dir, archiving_url=None):
     cache_dir = Path(gtfs_cache_dir)
     zip_path = f'{util.get_data_dir()}/gtfs-{agency.id}.zip'
-    if archiving_old == False:
+    if archiving_url == None:
         gtfs_url = agency.gtfs_url
     else:
-        '''
-        need to set up a system for properly getting URLs for archiving routes
-
-        get an old GFTS file from 2020-02-19
-        https://transitfeeds.com/p/sfmta/60/20200219/download
-        '''
-        #####gtfs_url = "https://transitfeeds.com/p/sfmta/60/20200219/download"
         gtfs_url = archiving_url
+
         # need to delete existing zip file and directory in order
         # to reuse for the archiving passes
         if cache_dir.exists():
-            #exit()
             shutil.rmtree(cache_dir)
             print('removed',cache_dir)
             os.remove(zip_path)
             print('removed',zip_path)
@@ -93,8 +85,7 @@ def download_gtfs_data(agency: config.Agency, gtfs_cache_dir, archiving_url=None
 
     with zipfile.ZipFile(zip_path, 'r') as zip_ref:
         zip_ref.extractall(gtfs_cache_dir)
-
-
+
 def is_subsequence(smaller, bigger):
     smaller_len = len(smaller)
     bigger_len = len(bigger)
@@ -132,16 +123,14 @@ def contains_excluded_stop(shape_stop_ids, excluded_stop_ids):
     return False
 
 class GtfsScraper:
-    def __init__(self, agency: config.Agency, archiving_old=False, archiving_url=None):
+    def __init__(self, agency: config.Agency, archiving_url=None):
         self.agency = agency
         self.agency_id = agency_id = agency.id
         gtfs_cache_dir = f'{util.get_data_dir()}/gtfs-{agency_id}'
 
-        download_gtfs_data(agency, gtfs_cache_dir, archiving_old=archiving_old, archiving_url=archiving_url)
+        download_gtfs_data(agency, gtfs_cache_dir, archiving_url=archiving_url)
 
         self.feed = ptg.load_geo_feed(gtfs_cache_dir, {})
-
         self.errors = []
         self.stop_times_by_trip = None
         self.stops_df = None
diff --git a/backend/save_routes.py b/backend/save_routes.py
index 6a8bd7e0..fe11e852 100644
--- a/backend/save_routes.py
+++ b/backend/save_routes.py
@@ -70,7 +70,7 @@
     for agency in agencies:
-        scraper = gtfs.GtfsScraper(agency, archiving_old=False)
+        scraper = gtfs.GtfsScraper(agency, archiving_url=None)
         scraper.save_routes(save_to_s3, d)
         errors += scraper.errors
         '''
@@ -85,7 +85,7 @@
         while(len(archiving_dates) > 0):
             archiving_date = archiving_dates.pop()
             archiving_url = archiving_urls.pop()
-            scraper_archiving = gtfs.GtfsScraper(agency, archiving_old=True, archiving_url=archiving_url)
+            scraper_archiving = gtfs.GtfsScraper(agency, archiving_url=archiving_url)
             scraper_archiving.save_routes(False, d, version_date=archiving_date)
             errors += scraper_archiving.errors
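Note: patch 19 below annotates the secrets import with "you may have to create this". The module it expects is just a local file holding the API key; a minimal sketch (the key value is a placeholder, and keeping this file out of version control is assumed):

    # backend/secrets.py (not checked in)
    transitfeeds_api_key = 'YOUR-TRANSITFEEDS-API-KEY'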
From 7ac1b10dcae016f0cabf4056892f842fd1cd83e8 Mon Sep 17 00:00:00 2001
From: Brian-Lee
Date: Fri, 24 Apr 2020 02:50:49 -0700
Subject: [PATCH 19/55] some cleanup

---
 backend/save_routes.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/backend/save_routes.py b/backend/save_routes.py
index fe11e852..0e415a74 100644
--- a/backend/save_routes.py
+++ b/backend/save_routes.py
@@ -3,7 +3,7 @@
 import argparse
 from datetime import date
 import requests
-from secrets import transitfeeds_api_key
+from secrets import transitfeeds_api_key # you may have to create this
 
 # Downloads and parses the GTFS specification
 # and saves the configuration for all routes to S3.
@@ -80,8 +80,6 @@
         '''
 
-
-
         while(len(archiving_dates) > 0):
             archiving_date = archiving_dates.pop()
             archiving_url = archiving_urls.pop()
             scraper_archiving = gtfs.GtfsScraper(agency, archiving_url=archiving_url)
             scraper_archiving.save_routes(False, d, version_date=archiving_date)
             errors += scraper_archiving.errors
 
     if args.timetables:
@@ -97,7 +95,5 @@
         compute_stats_for_dates(dates, agency, scheduled=True, save_to_s3=save_to_s3)
 
-
-
 if errors:
     raise Exception("\n".join(errors))

From 6ebe6554b38fe041c2cd5e6971ded7473d10830d Mon Sep 17 00:00:00 2001
From: Brian-Lee
Date: Mon, 27 Apr 2020 00:54:07 -0700
Subject: [PATCH 20/55] passed archiving_date instead of current date per
 reviewer suggestion

---
 backend/save_routes.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/backend/save_routes.py b/backend/save_routes.py
index 0e415a74..049864a7 100644
--- a/backend/save_routes.py
+++ b/backend/save_routes.py
@@ -1,7 +1,7 @@
 from models import gtfs, config
 from compute_stats import compute_stats_for_dates
 import argparse
-from datetime import date
+from datetime import date, datetime
 import requests
 from secrets import transitfeeds_api_key # you may have to create this
@@ -83,8 +83,8 @@
         while(len(archiving_dates) > 0):
             archiving_date = archiving_dates.pop()
             archiving_url = archiving_urls.pop()
-            scraper_archiving = gtfs.GtfsScraper(agency, archiving_url=archiving_url)
-            scraper_archiving.save_routes(False, d, version_date=archiving_date)
+            scraper_archiving = gtfs.GtfsScraper(agency, archiving_url=archiving_url)
+            scraper_archiving.save_routes(False, datetime.strptime(archiving_date, "%Y-%m-%d").date(), version_date=archiving_date)
             errors += scraper_archiving.errors

From ff9500b85a30fc94321f71f6016e22b30ec4eb94 Mon Sep 17 00:00:00 2001
From: Brian-Lee
Date: Mon, 27 Apr 2020 01:04:47 -0700
Subject: [PATCH 21/55] remove unecessary archive_date

---
 backend/save_routes.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/save_routes.py b/backend/save_routes.py
index 049864a7..391e24e5 100644
--- a/backend/save_routes.py
+++ b/backend/save_routes.py
@@ -54,7 +54,7 @@
     save_to_s3 = args.s3
 
     d = date.today()
-    archive_date = date.today()
+
     errors = []

From e618a1f667170ede8672fc5e109f67cc496fe673 Mon Sep 17 00:00:00 2001
From: Brian-Lee
Date: Mon, 27 Apr 2020 04:02:06 -0700
Subject: [PATCH 22/55] framework to take archiving_date argument

---
 backend/models/gtfs.py | 60 ++++++++++++++++++++++------------------
 backend/save_routes.py | 40 +++++++++------------------
 2 files changed, 46 insertions(+), 54 deletions(-)

diff --git a/backend/models/gtfs.py b/backend/models/gtfs.py
index 55ba1e45..dd46a198 100644
--- a/backend/models/gtfs.py
+++ b/backend/models/gtfs.py
@@ -51,40 +51,48 @@ def get_stop_geometry(stop_xy, shape_lines_xy, shape_cumulative_dist, start_inde
         'after_index': best_index, # the index of the coordinate of the shape just before this stop
         'offset': int(best_offset) # distance in meters between this stop and the closest line segment of shape
     }
+
 
-def download_gtfs_data(agency: config.Agency, gtfs_cache_dir, archiving_url=None):
-    cache_dir = Path(gtfs_cache_dir)
-    zip_path = f'{util.get_data_dir()}/gtfs-{agency.id}.zip'
-    if archiving_url == None:
+
+# dont forget to refactor to remove repetition with download_gtfs_data
+def get_gtfs_data(agency: config.Agency, gtfs_cache_dir, archiving_date=None):
+    if archiving_date == None:
+        cache_dir = Path(gtfs_cache_dir)
+        zip_path = f'{util.get_data_dir()}/gtfs-{agency.id}.zip'
         gtfs_url = agency.gtfs_url
-    else:
-        gtfs_url = archiving_url
-
-        # need to delete existing zip file and directory in order
-        # to reuse for the archiving passes
-        if cache_dir.exists():
-            shutil.rmtree(cache_dir)
-            print('removed',cache_dir)
-            os.remove(zip_path)
-            print('removed',zip_path)
 
-    if gtfs_url is None:
-        raise Exception(f'agency {agency.id} does not have gtfs_url in config')
+        if gtfs_url is None:
+            raise Exception(f'agency {agency.id} does not have gtfs_url in config')
+
+        if not cache_dir.exists():
+            print(f'downloading gtfs data from {gtfs_url}')
+            r = requests.get(gtfs_url)
 
-    if not cache_dir.exists():
-        print(f'downloading gtfs data from {gtfs_url}')
-        r = requests.get(gtfs_url)
+            if r.status_code != 200:
+                raise Exception(f"Error fetching {gtfs_url}: HTTP {r.status_code}: {r.text}")
 
-        if r.status_code != 200:
-            raise Exception(f"Error fetching {gtfs_url}: HTTP {r.status_code}: {r.text}")
+            with open(zip_path, 'wb') as f:
+                f.write(r.content)
+
+            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+                zip_ref.extractall(gtfs_cache_dir)
+
+    else:
+        cache_dir = Path(gtfs_cache_dir)
+        #zip_path = f'{util.get_data_dir()}/gtfs-{agency.id}.zip'
+        gtfs_path = f'{util.get_data_dir()}/gtfs-{agency.id}-{archiving_date}.zip'
+        print(gtfs_path)
+        print(gtfs_cache_dir)
+        zip_path = gtfs_path
 
-        with open(zip_path, 'wb') as f:
-            f.write(r.content)
 
         with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+            #zip_ref.extractall(gtfs_cache_dir+archiving_date)
             zip_ref.extractall(gtfs_cache_dir)
@@ -131,12 +131,12 @@ def contains_excluded_stop(shape_stop_ids, excluded_stop_ids):
     return False
 
 class GtfsScraper:
-    def __init__(self, agency: config.Agency, archiving_url=None):
+    def __init__(self, agency: config.Agency, archiving_date=None):
         self.agency = agency
         self.agency_id = agency_id = agency.id
         gtfs_cache_dir = f'{util.get_data_dir()}/gtfs-{agency_id}'
 
-        download_gtfs_data(agency, gtfs_cache_dir, archiving_url=archiving_url)
+        get_gtfs_data(agency, gtfs_cache_dir, archiving_date=archiving_date)
 
         self.feed = ptg.load_geo_feed(gtfs_cache_dir, {})
         self.errors = []
diff --git a/backend/save_routes.py b/backend/save_routes.py
index 391e24e5..dcbc101c 100644
--- a/backend/save_routes.py
+++ b/backend/save_routes.py
@@ -44,9 +44,11 @@
     parser.add_argument('--s3', dest='s3', action='store_true', help='store in s3')
     parser.add_argument('--timetables', dest='timetables', action='store_true', help='also save timetables')
     parser.add_argument('--scheduled-stats', dest='scheduled_stats', action='store_true', help='also compute scheduled stats if the timetable has new dates (requires --timetables)')
+    parser.add_argument('--archiving_date', required=False)
     parser.set_defaults(s3=False)
     parser.set_defaults(timetables=False)
     parser.set_defaults(scheduled_stats=False)
+    parser.set_defaults(archiving_date=None)
 
     args = parser.parse_args()
@@ -54,39 +56,21 @@
     save_to_s3 = args.s3
 
     d = date.today()
+    archiving_date = args.archiving_date
 
     errors = []
 
-    limit = '10'
-    urls_feed = 'https://api.transitfeeds.com/v1/getFeedVersions?key=' + transitfeeds_api_key + '&feed=sfmta%2F60&page=1&limit=' + limit + '&err=1&warn=1'
-
-    response = requests.get(urls_feed)
-    data = response.json()
-    archiving_urls = []
-    archiving_dates = []
-    for i in range(len(data['results']['versions'])):
-        archiving_urls.append(data['results']['versions'][i]['url'])
-        archiving_dates.append(archiving_urls[i].split('/')[6])
-        archiving_dates[i] = archiving_dates[i][:4]+'-'+archiving_dates[i][4:6]+'-'+archiving_dates[i][6:]
-
     for agency in agencies:
-        scraper = gtfs.GtfsScraper(agency, archiving_url=None)
-        scraper.save_routes(save_to_s3, d)
-        errors += scraper.errors
-        '''
-        use https://transitfeeds.com/api/swagger/
-        to get old routes
-        and cache them in date versioned folders
-
-        '''
-
-        while(len(archiving_dates) > 0):
-            archiving_date = archiving_dates.pop()
-            archiving_url = archiving_urls.pop()
-            scraper_archiving = gtfs.GtfsScraper(agency, archiving_url=archiving_url)
-            scraper_archiving.save_routes(False, datetime.strptime(archiving_date, "%Y-%m-%d").date(), version_date=archiving_date)
-            errors += scraper_archiving.errors
+        if archiving_date is None:
+            scraper = gtfs.GtfsScraper(agency)
+            scraper.save_routes(save_to_s3, d)
+            errors += scraper.errors
+        else:
+            scraper_archiving = gtfs.GtfsScraper(agency, archiving_date=archiving_date)
+            scraper_archiving.save_routes(False, datetime.strptime(archiving_date, "%Y-%m-%d").date(), version_date=archiving_date)
+            errors += scraper_archiving.errors
+
 
     if args.timetables:

From aa0b423b35eccc7cb5d1f7e3512044560711e09b Mon Sep 17 00:00:00 2001
From: Brian-Lee
Date: Mon, 27 Apr 2020 04:17:38 -0700
Subject: [PATCH 23/55] added some comments

---
 backend/save_routes.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/backend/save_routes.py b/backend/save_routes.py
index dcbc101c..2e03968c 100644
--- a/backend/save_routes.py
+++ b/backend/save_routes.py
@@ -60,12 +60,17 @@
 
     errors = []
 
+    # should probably change things so we supply gtfs_path
+    # instead of archiving_date
+
     for agency in agencies:
         if archiving_date is None:
+            # save the normal way, downloading the most recent GTFS file
             scraper = gtfs.GtfsScraper(agency)
             scraper.save_routes(save_to_s3, d)
             errors += scraper.errors
         else:
+            # save with date suffix, using the GTFS file provided
             scraper_archiving = gtfs.GtfsScraper(agency, archiving_date=archiving_date)
             scraper_archiving.save_routes(False, datetime.strptime(archiving_date, "%Y-%m-%d").date(), version_date=archiving_date)
             errors += scraper_archiving.errors

From d4f13f82068b31fce7f36e4d6061339e06512c40 Mon Sep 17 00:00:00 2001
From: Brian-Lee
Date: Mon, 27 Apr 2020 04:50:38 -0700
Subject: [PATCH 24/55] changed archived_date to gtfs_date

---
 backend/models/gtfs.py | 13 +++++--------
 backend/save_routes.py | 14 +++++++-------
 2 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/backend/models/gtfs.py b/backend/models/gtfs.py
index dd46a198..400d19c2 100644
--- a/backend/models/gtfs.py
+++ b/backend/models/gtfs.py
@@ -55,8 +55,8 @@ def get_stop_geometry(stop_xy, shape_lines_xy, shape_cumulative_dist, start_inde
 
 # dont forget to refactor to remove repetition with download_gtfs_data
-def get_gtfs_data(agency: config.Agency, gtfs_cache_dir, archiving_date=None):
-    if archiving_date == None:
+def get_gtfs_data(agency: config.Agency, gtfs_cache_dir, gtfs_date=None):
+    if gtfs_date == None:
         cache_dir = Path(gtfs_cache_dir)
         zip_path = f'{util.get_data_dir()}/gtfs-{agency.id}.zip'
         gtfs_url = agency.gtfs_url
@@ -80,17 +80,14 @@ def get_gtfs_data(agency: config.Agency, gtfs_cache_dir, gtfs_date=None):
     else:
         cache_dir = Path(gtfs_cache_dir)
-        #zip_path = f'{util.get_data_dir()}/gtfs-{agency.id}.zip'
-        gtfs_path = f'{util.get_data_dir()}/gtfs-{agency.id}-{archiving_date}.zip'
+        gtfs_path = f'{util.get_data_dir()}/gtfs-{agency.id}-{gtfs_date}.zip'
         print(gtfs_path)
         print(gtfs_cache_dir)
         zip_path = gtfs_path
 
-
         with zipfile.ZipFile(zip_path, 'r') as zip_ref:
-            #zip_ref.extractall(gtfs_cache_dir+archiving_date)
             zip_ref.extractall(gtfs_cache_dir)
@@ -131,12 +128,12 @@ def contains_excluded_stop(shape_stop_ids, excluded_stop_ids):
     return False
 
 class GtfsScraper:
-    def __init__(self, agency: config.Agency, archiving_date=None):
+    def __init__(self, agency: config.Agency, gtfs_date=None):
         self.agency = agency
         self.agency_id = agency_id = agency.id
         gtfs_cache_dir = f'{util.get_data_dir()}/gtfs-{agency_id}'
-        get_gtfs_data(agency, gtfs_cache_dir, archiving_date=archiving_date)
+        get_gtfs_data(agency, gtfs_cache_dir, gtfs_date=gtfs_date)
 
         self.feed = ptg.load_geo_feed(gtfs_cache_dir, {})
         self.errors = []
diff --git a/backend/save_routes.py b/backend/save_routes.py
index 2e03968c..f5161560 100644
--- a/backend/save_routes.py
+++ b/backend/save_routes.py
@@ -44,11 +44,11 @@
     parser.add_argument('--s3', dest='s3', action='store_true', help='store in s3')
     parser.add_argument('--timetables', dest='timetables', action='store_true', help='also save timetables')
     parser.add_argument('--scheduled-stats', dest='scheduled_stats', action='store_true', help='also compute scheduled stats if the timetable has new dates (requires --timetables)')
-    parser.add_argument('--archiving_date', required=False)
+    parser.add_argument('--gtfs_date', required=False)
     parser.set_defaults(s3=False)
     parser.set_defaults(timetables=False)
     parser.set_defaults(scheduled_stats=False)
-    parser.set_defaults(archiving_date=None)
+    parser.set_defaults(gtfs_date=None)
 
     args = parser.parse_args()
@@ -56,23 +56,23 @@
     save_to_s3 = args.s3
 
     d = date.today()
-    archiving_date = args.archiving_date
+    gtfs_date = args.gtfs_date
 
     errors = []
 
     # should probably change things so we supply gtfs_path
-    # instead of archiving_date
+    # instead of gtfs_date
 
     for agency in agencies:
-        if archiving_date is None:
+        if gtfs_date is None:
             # save the normal way, downloading the most recent GTFS file
             scraper = gtfs.GtfsScraper(agency)
             scraper.save_routes(save_to_s3, d)
             errors += scraper.errors
         else:
             # save with date suffix, using the GTFS file provided
-            scraper_archiving = gtfs.GtfsScraper(agency, archiving_date=archiving_date)
-            scraper_archiving.save_routes(False, datetime.strptime(archiving_date, "%Y-%m-%d").date(), version_date=archiving_date)
+            scraper_archiving = gtfs.GtfsScraper(agency, gtfs_date=gtfs_date)
+            scraper_archiving.save_routes(False, datetime.strptime(gtfs_date, "%Y-%m-%d").date(), version_date=gtfs_date)
             errors += scraper_archiving.errors
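Note: after patch 24 the archiving pass is driven entirely by the --gtfs_date flag; with the flag omitted the script saves current routes as before. Hypothetical invocations (the --agency flag spelling is inferred from args.agency in the script, and the second form assumes a dated zip such as data/gtfs-sfmta-2020-02-19.zip is already present):

    python save_routes.py --agency sfmta
    python save_routes.py --agency sfmta --gtfs_date 2020-02-19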
From 24f56e98a4001de435e8c34644b4cd05077eb8ec Mon Sep 17 00:00:00 2001
From: Brian-Lee
Date: Mon, 27 Apr 2020 05:10:22 -0700
Subject: [PATCH 25/55] combined GtfsScraper calls for both cases

---
 backend/models/gtfs.py | 3 ---
 backend/save_routes.py | 16 ++++++++++------
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/backend/models/gtfs.py b/backend/models/gtfs.py
index 400d19c2..f85ed7e9 100644
--- a/backend/models/gtfs.py
+++ b/backend/models/gtfs.py
@@ -82,11 +82,8 @@ def get_gtfs_data(agency: config.Agency, gtfs_cache_dir, gtfs_date=None):
     else:
         cache_dir = Path(gtfs_cache_dir)
         gtfs_path = f'{util.get_data_dir()}/gtfs-{agency.id}-{gtfs_date}.zip'
-        print(gtfs_path)
-        print(gtfs_cache_dir)
         zip_path = gtfs_path
-
         with zipfile.ZipFile(zip_path, 'r') as zip_ref:
             zip_ref.extractall(gtfs_cache_dir)
diff --git a/backend/save_routes.py b/backend/save_routes.py
index f5161560..18091da7 100644
--- a/backend/save_routes.py
+++ b/backend/save_routes.py
@@ -64,16 +64,20 @@
     # instead of gtfs_date
 
     for agency in agencies:
+
         if gtfs_date is None:
             # save the normal way, downloading the most recent GTFS file
-            scraper = gtfs.GtfsScraper(agency)
-            scraper.save_routes(save_to_s3, d)
-            errors += scraper.errors
+            date_to_use=d
         else:
             # save with date suffix, using the GTFS file provided
-            scraper_archiving = gtfs.GtfsScraper(agency, gtfs_date=gtfs_date)
-            scraper_archiving.save_routes(False, datetime.strptime(gtfs_date, "%Y-%m-%d").date(), version_date=gtfs_date)
-            errors += scraper_archiving.errors
+            save_to_s3=False
+            date_to_use=datetime.strptime(gtfs_date, "%Y-%m-%d").date()
+
+        # saves the routes
+        scraper = gtfs.GtfsScraper(agency, gtfs_date=gtfs_date)
+        scraper.save_routes(save_to_s3, date_to_use, version_date=gtfs_date)
+        errors += scraper.errors
+
 
     if args.timetables:

From bb7dbf49cea9e352857f61ab16e66d3279cfbc61 Mon Sep 17 00:00:00 2001
From: Brian-Lee
Date: Mon, 27 Apr 2020 05:16:11 -0700
Subject: [PATCH 26/55] eliminated variable d

---
 backend/save_routes.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/backend/save_routes.py b/backend/save_routes.py
index 18091da7..5c3b54d8 100644
--- a/backend/save_routes.py
+++ b/backend/save_routes.py
@@ -55,7 +55,6 @@
     agencies = [config.get_agency(args.agency)] if args.agency is not None else config.agencies
 
     save_to_s3 = args.s3
-    d = date.today()
 
     gtfs_date = args.gtfs_date
 
     errors = []
@@ -67,19 +66,18 @@
 
         if gtfs_date is None:
             # save the normal way, downloading the most recent GTFS file
-            date_to_use=d
+            date_to_use=date.today()
         else:
             # save with date suffix, using the GTFS file provided
             save_to_s3=False
             date_to_use=datetime.strptime(gtfs_date, "%Y-%m-%d").date()
 
-        # saves the routes
+        # save the routes
         scraper = gtfs.GtfsScraper(agency, gtfs_date=gtfs_date)
         scraper.save_routes(save_to_s3, date_to_use, version_date=gtfs_date)
         errors += scraper.errors
-
-
+
     if args.timetables:
         timetables_updated = scraper.save_timetables(save_to_s3=save_to_s3, skip_existing=True)
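Note: patch 27 below adds a backwards search so that a requested date falls back to the nearest earlier cached zip. The loop's core, shown standalone with a small bound (the filename pattern matches the patch; since no file exists here, it simply walks back three days):

    from datetime import datetime, timedelta
    from pathlib import Path

    gtfs_date_to_use = '2020-02-19'
    gtfs_path = f'data/gtfs-sfmta-{gtfs_date_to_use}.zip'
    loops, max_loops = 0, 3  # the patch uses 365
    while not Path(gtfs_path).is_file() and loops < max_loops:
        # step back one day and re-render the date as a string
        gtfs_date_to_use = (datetime.strptime(gtfs_date_to_use, '%Y-%m-%d') - timedelta(days=1)).strftime('%Y-%m-%d')
        gtfs_path = f'data/gtfs-sfmta-{gtfs_date_to_use}.zip'
        loops += 1
    print(gtfs_date_to_use)  # nearest earlier date tried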
import config, util, nextbus, routeconfig, timetables @@ -82,6 +83,19 @@ def get_gtfs_data(agency: config.Agency, gtfs_cache_dir, gtfs_date=None): else: cache_dir = Path(gtfs_cache_dir) gtfs_path = f'{util.get_data_dir()}/gtfs-{agency.id}-{gtfs_date}.zip' + + # check if this zip file exists + loops = 0 + max_loops = 365 + gtfs_date_to_use = gtfs_date + while Path(gtfs_path).is_file() == False and loops < max_loops: + + # go back one day and re-represent date as a string + gtfs_date_to_use = (datetime.strptime(gtfs_date_to_use, '%Y-%m-%d') - timedelta(days=1)).strftime('%Y-%m-%d') + gtfs_path = f'{util.get_data_dir()}/gtfs-{agency.id}-{gtfs_date_to_use}.zip' + + loops += 1 + zip_path = gtfs_path with zipfile.ZipFile(zip_path, 'r') as zip_ref: From 4d568e4d07924dccd52c7d7c2f46590d35fa9289 Mon Sep 17 00:00:00 2001 From: Brian-Lee Date: Mon, 27 Apr 2020 14:39:21 -0700 Subject: [PATCH 28/55] date suffix now matches actual date found and used --- backend/models/gtfs.py | 5 +---- backend/save_routes.py | 29 ++++++++++++++++++++++++----- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/backend/models/gtfs.py b/backend/models/gtfs.py index 138e89c4..74acdf16 100644 --- a/backend/models/gtfs.py +++ b/backend/models/gtfs.py @@ -54,8 +54,6 @@ def get_stop_geometry(stop_xy, shape_lines_xy, shape_cumulative_dist, start_inde } - -# dont forget to refactor to remove repetition with download_gtfs_data def get_gtfs_data(agency: config.Agency, gtfs_cache_dir, gtfs_date=None): if gtfs_date == None: cache_dir = Path(gtfs_cache_dir) @@ -93,6 +91,7 @@ def get_gtfs_data(agency: config.Agency, gtfs_cache_dir, gtfs_date=None): # go back one day and re-represent date as a string gtfs_date_to_use = (datetime.strptime(gtfs_date_to_use, '%Y-%m-%d') - timedelta(days=1)).strftime('%Y-%m-%d') gtfs_path = f'{util.get_data_dir()}/gtfs-{agency.id}-{gtfs_date_to_use}.zip' + gtfs_cache_dir_to_use = f'{util.get_data_dir()}/gtfs-{agency.id}-{gtfs_date_to_use}.zip' loops += 1 @@ -291,8 +290,6 @@ def save_timetables(self, save_to_s3=False, skip_existing=False): agency_id = self.agency_id dates_map = self.get_services_by_date() - ##bri## print('here\n\n\n',dates_map) - ##bri## exit() # # Typically, many dates have identical scheduled timetables (with times relative to midnight on that date). # Instead of storing redundant timetables for each date, store one timetable per route for each unique set of service_ids. 
diff --git a/backend/save_routes.py b/backend/save_routes.py index 5c3b54d8..90cc722c 100644 --- a/backend/save_routes.py +++ b/backend/save_routes.py @@ -1,8 +1,9 @@ -from models import gtfs, config +from models import gtfs, config, util from compute_stats import compute_stats_for_dates import argparse -from datetime import date, datetime +from datetime import date, datetime, timedelta import requests +from pathlib import Path from secrets import transitfeeds_api_key # you may have to create this # Downloads and parses the GTFS specification @@ -70,11 +71,29 @@ else: # save with date suffix, using the GTFS file provided save_to_s3=False - date_to_use=datetime.strptime(gtfs_date, "%Y-%m-%d").date() + date_to_use=datetime.strptime(gtfs_date, "%Y-%m-%d").date() + + gtfs_path = f'{util.get_data_dir()}/gtfs-{agency.id}-{gtfs_date}.zip' + + + # check if this zip file exists + loops = 0 + max_loops = 365 + gtfs_date_to_use = gtfs_date + while Path(gtfs_path).is_file() == False and loops < max_loops: + + # go back one day and re-represent date as a string + gtfs_date_to_use = (datetime.strptime(gtfs_date_to_use, '%Y-%m-%d') - timedelta(days=1)).strftime('%Y-%m-%d') + gtfs_path = f'{util.get_data_dir()}/gtfs-{agency.id}-{gtfs_date_to_use}.zip' + gtfs_cache_dir_to_use = f'{util.get_data_dir()}/gtfs-{agency.id}-{gtfs_date_to_use}.zip' + + loops += 1 + + # save the routes - scraper = gtfs.GtfsScraper(agency, gtfs_date=gtfs_date) - scraper.save_routes(save_to_s3, date_to_use, version_date=gtfs_date) + scraper = gtfs.GtfsScraper(agency, gtfs_date=gtfs_date_to_use) + scraper.save_routes(save_to_s3, date_to_use, version_date=gtfs_date_to_use) errors += scraper.errors From d5a7cbc551142a7b29587f0563da455aee76aa03 Mon Sep 17 00:00:00 2001 From: Brian-Lee Date: Mon, 27 Apr 2020 14:43:44 -0700 Subject: [PATCH 29/55] removed duplicative checking for dated gtfs zipfile --- backend/models/gtfs.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/backend/models/gtfs.py b/backend/models/gtfs.py index 74acdf16..b904ba09 100644 --- a/backend/models/gtfs.py +++ b/backend/models/gtfs.py @@ -81,20 +81,6 @@ def get_gtfs_data(agency: config.Agency, gtfs_cache_dir, gtfs_date=None): else: cache_dir = Path(gtfs_cache_dir) gtfs_path = f'{util.get_data_dir()}/gtfs-{agency.id}-{gtfs_date}.zip' - - # check if this zip file exists - loops = 0 - max_loops = 365 - gtfs_date_to_use = gtfs_date - while Path(gtfs_path).is_file() == False and loops < max_loops: - - # go back one day and re-represent date as a string - gtfs_date_to_use = (datetime.strptime(gtfs_date_to_use, '%Y-%m-%d') - timedelta(days=1)).strftime('%Y-%m-%d') - gtfs_path = f'{util.get_data_dir()}/gtfs-{agency.id}-{gtfs_date_to_use}.zip' - gtfs_cache_dir_to_use = f'{util.get_data_dir()}/gtfs-{agency.id}-{gtfs_date_to_use}.zip' - - loops += 1 - zip_path = gtfs_path with zipfile.ZipFile(zip_path, 'r') as zip_ref: From 0bf3cf2e55899c2a4ddcc8d0ddbe0f4781ddd6d7 Mon Sep 17 00:00:00 2001 From: Brian-Lee Date: Mon, 27 Apr 2020 14:52:30 -0700 Subject: [PATCH 30/55] pass gtfs_path to scraper instead of gtfs_date --- backend/models/gtfs.py | 9 ++++----- backend/save_routes.py | 3 ++- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/backend/models/gtfs.py b/backend/models/gtfs.py index b904ba09..68b916c5 100644 --- a/backend/models/gtfs.py +++ b/backend/models/gtfs.py @@ -54,8 +54,8 @@ def get_stop_geometry(stop_xy, shape_lines_xy, shape_cumulative_dist, start_inde } -def get_gtfs_data(agency: config.Agency, gtfs_cache_dir, gtfs_date=None): - if 
gtfs_date == None:
+def get_gtfs_data(agency: config.Agency, gtfs_cache_dir, gtfs_path=None):
+    if gtfs_path == None:
         cache_dir = Path(gtfs_cache_dir)
         zip_path = f'{util.get_data_dir()}/gtfs-{agency.id}.zip'
         gtfs_url = agency.gtfs_url
@@ -80,7 +80,6 @@ def get_gtfs_data(agency: config.Agency, gtfs_cache_dir, gtfs_date=None):

     else:
         cache_dir = Path(gtfs_cache_dir)
-        gtfs_path = f'{util.get_data_dir()}/gtfs-{agency.id}-{gtfs_date}.zip'
         zip_path = gtfs_path

         with zipfile.ZipFile(zip_path, 'r') as zip_ref:
@@ -124,12 +123,12 @@ def contains_excluded_stop(shape_stop_ids, excluded_stop_ids):
     return False

 class GtfsScraper:
-    def __init__(self, agency: config.Agency, gtfs_date=None):
+    def __init__(self, agency: config.Agency, gtfs_path=None):
         self.agency = agency
         self.agency_id = agency_id = agency.id
         gtfs_cache_dir = f'{util.get_data_dir()}/gtfs-{agency_id}'

-        get_gtfs_data(agency, gtfs_cache_dir, gtfs_date=gtfs_date)
+        get_gtfs_data(agency, gtfs_cache_dir, gtfs_path=gtfs_path)

         self.feed = ptg.load_geo_feed(gtfs_cache_dir, {})
         self.errors = []
diff --git a/backend/save_routes.py b/backend/save_routes.py
index 90cc722c..40dacd74 100644
--- a/backend/save_routes.py
+++ b/backend/save_routes.py
@@ -92,7 +92,8 @@

         # save the routes
-        scraper = gtfs.GtfsScraper(agency, gtfs_date=gtfs_date_to_use)
+        #scraper = gtfs.GtfsScraper(agency, gtfs_date=gtfs_date_to_use)
+        scraper = gtfs.GtfsScraper(agency, gtfs_path=gtfs_path)
         scraper.save_routes(save_to_s3, date_to_use, version_date=gtfs_date_to_use)
         errors += scraper.errors

From e49646685335f69053c42bc1a916a62b1ad8dfcf Mon Sep 17 00:00:00 2001
From: Brian-Lee
Date: Mon, 27 Apr 2020 14:56:53 -0700
Subject: [PATCH 31/55] some cleanup

---
 backend/save_routes.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/backend/save_routes.py b/backend/save_routes.py
index 40dacd74..8a4c6605 100644
--- a/backend/save_routes.py
+++ b/backend/save_routes.py
@@ -74,25 +74,18 @@
         date_to_use=datetime.strptime(gtfs_date, "%Y-%m-%d").date()

         gtfs_path = f'{util.get_data_dir()}/gtfs-{agency.id}-{gtfs_date}.zip'
-
-
         # check if this zip file exists
         loops = 0
         max_loops = 365
         gtfs_date_to_use = gtfs_date
         while Path(gtfs_path).is_file() == False and loops < max_loops:
-
             # go back one day and re-represent date as a string
             gtfs_date_to_use = (datetime.strptime(gtfs_date_to_use, '%Y-%m-%d') - timedelta(days=1)).strftime('%Y-%m-%d')
             gtfs_path = f'{util.get_data_dir()}/gtfs-{agency.id}-{gtfs_date_to_use}.zip'
             gtfs_cache_dir_to_use = f'{util.get_data_dir()}/gtfs-{agency.id}-{gtfs_date_to_use}.zip'
-
             loops += 1
-
-
         # save the routes
-        #scraper = gtfs.GtfsScraper(agency, gtfs_date=gtfs_date_to_use)
         scraper = gtfs.GtfsScraper(agency, gtfs_path=gtfs_path)
         scraper.save_routes(save_to_s3, date_to_use, version_date=gtfs_date_to_use)
         errors += scraper.errors

From 93b90b737af007ed53c4d1f743ef482d3d3c5267 Mon Sep 17 00:00:00 2001
From: Brian-Lee
Date: Mon, 27 Apr 2020 15:16:22 -0700
Subject: [PATCH 32/55] fixed bug where save_routes.py was broken without gtfs_date argument

---
 backend/save_routes.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/backend/save_routes.py b/backend/save_routes.py
index 8a4c6605..38bea197 100644
--- a/backend/save_routes.py
+++ b/backend/save_routes.py
@@ -68,6 +68,8 @@
         if gtfs_date is None:
             # save the normal way, downloading the most recent GTFS file
             date_to_use=date.today()
+            gtfs_date_to_use=date.today()
+            gtfs_path = None
         else:
             # save with date suffix, using the GTFS file provided
             save_to_s3=False
             date_to_use=datetime.strptime(gtfs_date, "%Y-%m-%d").date()

From af1905549183e2c9d5c1796f7d33867d362f753e Mon Sep 17 
00:00:00 2001 From: Brian-Lee Date: Mon, 27 Apr 2020 15:31:07 -0700 Subject: [PATCH 33/55] added a comment --- backend/save_routes.py | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/save_routes.py b/backend/save_routes.py index 38bea197..158ce547 100644 --- a/backend/save_routes.py +++ b/backend/save_routes.py @@ -67,6 +67,7 @@ if gtfs_date is None: # save the normal way, downloading the most recent GTFS file + # should probably not be using both date_to_use and gtfs_date_to_use date_to_use=date.today() gtfs_date_to_use=date.today() gtfs_path = None From 483a0b28d9ec05505c46cdf7f473835c542d5d2f Mon Sep 17 00:00:00 2001 From: Brian-Lee Date: Wed, 29 Apr 2020 21:38:52 -0700 Subject: [PATCH 34/55] combined duplicative lines --- backend/models/gtfs.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/backend/models/gtfs.py b/backend/models/gtfs.py index 68b916c5..65120db0 100644 --- a/backend/models/gtfs.py +++ b/backend/models/gtfs.py @@ -75,15 +75,15 @@ def get_gtfs_data(agency: config.Agency, gtfs_cache_dir, gtfs_path=None): with open(zip_path, 'wb') as f: f.write(r.content) - with zipfile.ZipFile(zip_path, 'r') as zip_ref: - zip_ref.extractall(gtfs_cache_dir) + #with zipfile.ZipFile(zip_path, 'r') as zip_ref: + # zip_ref.extractall(gtfs_cache_dir) else: cache_dir = Path(gtfs_cache_dir) zip_path = gtfs_path - with zipfile.ZipFile(zip_path, 'r') as zip_ref: - zip_ref.extractall(gtfs_cache_dir) + with zipfile.ZipFile(zip_path, 'r') as zip_ref: + zip_ref.extractall(gtfs_cache_dir) def is_subsequence(smaller, bigger): From 2084742c9ba6ac86141713f22bd9cccd9d6fc36e Mon Sep 17 00:00:00 2001 From: Brian-Lee Date: Wed, 29 Apr 2020 21:43:38 -0700 Subject: [PATCH 35/55] changed command line argument gtfs_date to date --- backend/save_routes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/save_routes.py b/backend/save_routes.py index 158ce547..eec0220a 100644 --- a/backend/save_routes.py +++ b/backend/save_routes.py @@ -45,7 +45,7 @@ parser.add_argument('--s3', dest='s3', action='store_true', help='store in s3') parser.add_argument('--timetables', dest='timetables', action='store_true', help='also save timetables') parser.add_argument('--scheduled-stats', dest='scheduled_stats', action='store_true', help='also compute scheduled stats if the timetable has new dates (requires --timetables)') - parser.add_argument('--gtfs_date', required=False) + parser.add_argument('--date', required=False) parser.set_defaults(s3=False) parser.set_defaults(timetables=False) parser.set_defaults(scheduled_stats=False) @@ -56,7 +56,7 @@ agencies = [config.get_agency(args.agency)] if args.agency is not None else config.agencies save_to_s3 = args.s3 - gtfs_date = args.gtfs_date + gtfs_date = args.date errors = [] From 385e20998c30427a3ab6dd25c6f4f972c73430e1 Mon Sep 17 00:00:00 2001 From: Brian-Lee Date: Wed, 29 Apr 2020 23:12:53 -0700 Subject: [PATCH 36/55] changed the method of finding most recent gtfs zip --- backend/save_routes.py | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/backend/save_routes.py b/backend/save_routes.py index eec0220a..117ea95e 100644 --- a/backend/save_routes.py +++ b/backend/save_routes.py @@ -5,6 +5,7 @@ import requests from pathlib import Path from secrets import transitfeeds_api_key # you may have to create this +import os # Downloads and parses the GTFS specification # and saves the configuration for all routes to S3. 
@@ -75,18 +76,27 @@ # save with date suffix, using the GTFS file provided save_to_s3=False date_to_use=datetime.strptime(gtfs_date, "%Y-%m-%d").date() - gtfs_path = f'{util.get_data_dir()}/gtfs-{agency.id}-{gtfs_date}.zip' - # check if this zip file exists - loops = 0 - max_loops = 365 - gtfs_date_to_use = gtfs_date - while Path(gtfs_path).is_file() == False and loops < max_loops: - # go back one day and re-represent date as a string - gtfs_date_to_use = (datetime.strptime(gtfs_date_to_use, '%Y-%m-%d') - timedelta(days=1)).strftime('%Y-%m-%d') - gtfs_path = f'{util.get_data_dir()}/gtfs-{agency.id}-{gtfs_date_to_use}.zip' - gtfs_cache_dir_to_use = f'{util.get_data_dir()}/gtfs-{agency.id}-{gtfs_date_to_use}.zip' - loops += 1 + + # find most recent zip file before gtfs_date + best_candidate_zip_file = "" + best_candidate_date = datetime.today() + smallest_timedelta_so_far = timedelta.max + for candidate_zip_file in os.listdir(util.get_data_dir()): + if f'gtfs-{agency.id}-' in candidate_zip_file and '.zip' in candidate_zip_file: + candidate_year = candidate_zip_file.split('-')[2] + candidate_month = candidate_zip_file.split('-')[3] + candidate_day = candidate_zip_file.split('-')[4] + candidate_day = candidate_day.split(".zip")[0] + candidate_date_string = candidate_year+"-"+candidate_month+"-"+candidate_day + candidate_date = datetime.strptime(candidate_date_string,"%Y-%m-%d").date() + if candidate_date - date_to_use <= smallest_timedelta_so_far and candidate_date <= date_to_use: + best_candidate_date = candidate_date + best_candidate_zip_file = candidate_zip_file + + gtfs_date_to_use = best_candidate_date + gtfs_path = best_candidate_zip_file + gtfs_path = f'{util.get_data_dir()}/{best_candidate_zip_file}' # save the routes scraper = gtfs.GtfsScraper(agency, gtfs_path=gtfs_path) From f35f14ac61576f55600ac502945920f07bbf2cd1 Mon Sep 17 00:00:00 2001 From: Brian-Lee Date: Wed, 6 May 2020 18:41:46 -0700 Subject: [PATCH 37/55] combined two identical lines into one --- backend/models/gtfs.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/backend/models/gtfs.py b/backend/models/gtfs.py index 65120db0..e66e6d44 100644 --- a/backend/models/gtfs.py +++ b/backend/models/gtfs.py @@ -55,8 +55,8 @@ def get_stop_geometry(stop_xy, shape_lines_xy, shape_cumulative_dist, start_inde def get_gtfs_data(agency: config.Agency, gtfs_cache_dir, gtfs_path=None): + cache_dir = Path(gtfs_cache_dir) if gtfs_path == None: - cache_dir = Path(gtfs_cache_dir) zip_path = f'{util.get_data_dir()}/gtfs-{agency.id}.zip' gtfs_url = agency.gtfs_url @@ -79,7 +79,6 @@ def get_gtfs_data(agency: config.Agency, gtfs_cache_dir, gtfs_path=None): # zip_ref.extractall(gtfs_cache_dir) else: - cache_dir = Path(gtfs_cache_dir) zip_path = gtfs_path with zipfile.ZipFile(zip_path, 'r') as zip_ref: From e219b420b0128b86bb15dd8b3278413e30d97b47 Mon Sep 17 00:00:00 2001 From: Brian-Lee Date: Wed, 6 May 2020 18:54:25 -0700 Subject: [PATCH 38/55] reduced if-else to just if --- backend/models/gtfs.py | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/backend/models/gtfs.py b/backend/models/gtfs.py index e66e6d44..eec92faf 100644 --- a/backend/models/gtfs.py +++ b/backend/models/gtfs.py @@ -56,29 +56,25 @@ def get_stop_geometry(stop_xy, shape_lines_xy, shape_cumulative_dist, start_inde def get_gtfs_data(agency: config.Agency, gtfs_cache_dir, gtfs_path=None): cache_dir = Path(gtfs_cache_dir) - if gtfs_path == None: - zip_path = f'{util.get_data_dir()}/gtfs-{agency.id}.zip' - gtfs_url = 
agency.gtfs_url

-        if gtfs_url is None:
-            raise Exception(f'agency {agency.id} does not have gtfs_url in config')
+    if gtfs_url is None:
+        raise Exception(f'agency {agency.id} does not have gtfs_url in config')

-        if not cache_dir.exists():
-            print(f'downloading gtfs data from {gtfs_url}')
-            r = requests.get(gtfs_url)
+    if not cache_dir.exists():
+        print(f'downloading gtfs data from {gtfs_url}')
+        r = requests.get(gtfs_url)

-            if r.status_code != 200:
-                raise Exception(f"Error fetching {gtfs_url}: HTTP {r.status_code}: {r.text}")
+        if r.status_code != 200:
+            raise Exception(f"Error fetching {gtfs_url}: HTTP {r.status_code}: {r.text}")

-            with open(zip_path, 'wb') as f:
-                f.write(r.content)
+        with open(zip_path, 'wb') as f:
+            f.write(r.content)

-    #with zipfile.ZipFile(zip_path, 'r') as zip_ref:
-    #    zip_ref.extractall(gtfs_cache_dir)
-
-    else:
+    if gtfs_path is not None:
         zip_path = gtfs_path

     with zipfile.ZipFile(zip_path, 'r') as zip_ref:

From 198a3ae0948c93a0a92dd5e7e6fdc0cb9fe9cc98 Mon Sep 17 00:00:00 2001
From: Brian-Lee
Date: Wed, 6 May 2020 19:03:34 -0700
Subject: [PATCH 39/55] eliminated unnecessary else keyword

---
 backend/models/routeconfig.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/backend/models/routeconfig.py b/backend/models/routeconfig.py
index 568948e2..52775964 100644
--- a/backend/models/routeconfig.py
+++ b/backend/models/routeconfig.py
@@ -127,11 +127,9 @@ def get_cache_path(agency_id, version=DefaultVersion, version_date=None):
     # It has nothing to do with version=DefaultVersion
     if version_date == None:
         return f'{util.get_data_dir()}/routes_{version}_{agency_id}.json'
-    else:
-        return f'{util.get_data_dir()}/routes_{version}_{agency_id}_{version_date}/routes_{version}_{agency_id}_{version_date}.json'
-
+
+    return f'{util.get_data_dir()}/routes_{version}_{agency_id}_{version_date}/routes_{version}_{agency_id}_{version_date}.json'

-    ##bri##return f"{util.get_data_dir()}/datekeys_{version}_{agency_id}/datekeys_{version}_{agency_id}.json"

 def get_s3_path(agency_id, version=DefaultVersion):

From 8fcbb5bea928b0d09ad7608fe64dc8a8ecda42d0 Mon Sep 17 00:00:00 2001
From: Brian-Lee
Date: Wed, 6 May 2020 19:06:12 -0700
Subject: [PATCH 40/55] removed outdated comments

---
 backend/save_routes.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/backend/save_routes.py b/backend/save_routes.py
index 117ea95e..ea0f2377 100644
--- a/backend/save_routes.py
+++ b/backend/save_routes.py
@@ -61,9 +61,6 @@

     errors = []

-    # should probably change things so we supply gtfs_path
-    # instead of gtfs_date
-
     for agency in agencies:

From 46c99270b802f18358f04ef6828f2e7e53ae2002 Mon Sep 17 00:00:00 2001
From: Brian-Lee
Date: Wed, 6 May 2020 19:09:37 -0700
Subject: [PATCH 41/55] removed unnecessary assignment of save_to_s3

---
 backend/save_routes.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/backend/save_routes.py b/backend/save_routes.py
index ea0f2377..f5dfba03 100644
--- a/backend/save_routes.py
+++ b/backend/save_routes.py
@@ -71,7 +71,6 @@
             gtfs_path = None
         else:
             # save with date suffix, using the GTFS file provided
-            save_to_s3=False
             date_to_use=datetime.strptime(gtfs_date, "%Y-%m-%d").date()
             gtfs_path = f'{util.get_data_dir()}/gtfs-{agency.id}-{gtfs_date}.zip'

From d4f8b72acbcc14712785548b5572fd4251d1c63c Mon Sep 17 00:00:00 2001
From: Brian-Lee
Date: Wed, 6 May 2020 19:51:17 -0700
Subject: [PATCH 42/55] changed 'best_candidate' vars to 
'recentmost_date_qualified' vars --- backend/save_routes.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/backend/save_routes.py b/backend/save_routes.py index f5dfba03..89b5f491 100644 --- a/backend/save_routes.py +++ b/backend/save_routes.py @@ -74,9 +74,19 @@ date_to_use=datetime.strptime(gtfs_date, "%Y-%m-%d").date() gtfs_path = f'{util.get_data_dir()}/gtfs-{agency.id}-{gtfs_date}.zip' - # find most recent zip file before gtfs_date - best_candidate_zip_file = "" - best_candidate_date = datetime.today() + ''' + Find most recent zip file before gtfs_date. + recentmost_date_qualified_zip_file is: + "date qualified" and "recentmost" + + "date qualified" means the date of the file is no later than the date + argument given. + + "recentmost" means it is the most recent file that qualifies. + ''' + + recentmost_date_qualified_zip_file = "" + recentmost_date_qualified_date = datetime.today() smallest_timedelta_so_far = timedelta.max for candidate_zip_file in os.listdir(util.get_data_dir()): if f'gtfs-{agency.id}-' in candidate_zip_file and '.zip' in candidate_zip_file: @@ -87,12 +97,12 @@ candidate_date_string = candidate_year+"-"+candidate_month+"-"+candidate_day candidate_date = datetime.strptime(candidate_date_string,"%Y-%m-%d").date() if candidate_date - date_to_use <= smallest_timedelta_so_far and candidate_date <= date_to_use: - best_candidate_date = candidate_date - best_candidate_zip_file = candidate_zip_file + recentmost_date_qualified_date = candidate_date + recentmost_date_qualified_zip_file = candidate_zip_file - gtfs_date_to_use = best_candidate_date - gtfs_path = best_candidate_zip_file - gtfs_path = f'{util.get_data_dir()}/{best_candidate_zip_file}' + gtfs_date_to_use = recentmost_date_qualified_date + gtfs_path = recentmost_date_qualified_zip_file + gtfs_path = f'{util.get_data_dir()}/{recentmost_date_qualified_zip_file}' # save the routes scraper = gtfs.GtfsScraper(agency, gtfs_path=gtfs_path) From 2a4c8072b2cf8a351bc32d186a743fd1da5a7659 Mon Sep 17 00:00:00 2001 From: Brian-Lee Date: Wed, 6 May 2020 20:00:10 -0700 Subject: [PATCH 43/55] setting starting date more appropriately to date argument --- backend/save_routes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/save_routes.py b/backend/save_routes.py index 89b5f491..fca6662e 100644 --- a/backend/save_routes.py +++ b/backend/save_routes.py @@ -86,7 +86,7 @@ ''' recentmost_date_qualified_zip_file = "" - recentmost_date_qualified_date = datetime.today() + recentmost_date_qualified_date = gtfs_date smallest_timedelta_so_far = timedelta.max for candidate_zip_file in os.listdir(util.get_data_dir()): if f'gtfs-{agency.id}-' in candidate_zip_file and '.zip' in candidate_zip_file: From a5f5a62e806ab07a98fe3821b7c695510524a42c Mon Sep 17 00:00:00 2001 From: Brian-Lee Date: Thu, 21 May 2020 13:45:17 -0700 Subject: [PATCH 44/55] chained two lines into one --- backend/save_routes.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/backend/save_routes.py b/backend/save_routes.py index fca6662e..15d9d50b 100644 --- a/backend/save_routes.py +++ b/backend/save_routes.py @@ -92,8 +92,7 @@ if f'gtfs-{agency.id}-' in candidate_zip_file and '.zip' in candidate_zip_file: candidate_year = candidate_zip_file.split('-')[2] candidate_month = candidate_zip_file.split('-')[3] - candidate_day = candidate_zip_file.split('-')[4] - candidate_day = candidate_day.split(".zip")[0] + candidate_day = candidate_zip_file.split('-')[4].split(".zip")[0] 
candidate_date_string = candidate_year+"-"+candidate_month+"-"+candidate_day
                     candidate_date = datetime.strptime(candidate_date_string,"%Y-%m-%d").date()
                     if candidate_date - date_to_use <= smallest_timedelta_so_far and candidate_date <= date_to_use:

From 4d24dfe9b91c2536517c1628937a9b134a80c6db Mon Sep 17 00:00:00 2001
From: Brian-Lee
Date: Thu, 21 May 2020 13:48:25 -0700
Subject: [PATCH 45/55] eliminated unnecessary import shutil

---
 backend/models/gtfs.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/backend/models/gtfs.py b/backend/models/gtfs.py
index eec92faf..813690a3 100644
--- a/backend/models/gtfs.py
+++ b/backend/models/gtfs.py
@@ -8,7 +8,6 @@
 import gzip
 import hashlib
 import zipfile
-import shutil
 import os
 from datetime import datetime, timedelta

From fdba9b7a6dc28855938c4b44418145457c1bdbd8 Mon Sep 17 00:00:00 2001
From: Brian-Lee
Date: Thu, 21 May 2020 13:56:20 -0700
Subject: [PATCH 46/55] changed all occurrences of version_date to gtfs_date

---
 backend/models/gtfs.py | 4 ++--
 backend/models/routeconfig.py | 12 +++++-------
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/backend/models/gtfs.py b/backend/models/gtfs.py
index 813690a3..5544e4b5 100644
--- a/backend/models/gtfs.py
+++ b/backend/models/gtfs.py
@@ -1065,7 +1065,7 @@ def get_sort_key(route_data):
             return route_data['title']
         return sorted(routes_data, key=get_sort_key)

-    def save_routes(self, save_to_s3, d, version_date=None):
+    def save_routes(self, save_to_s3, d, gtfs_date=None):
         agency = self.agency
         agency_id = agency.id
         routes_df = self.get_gtfs_routes()
@@ -1085,4 +1085,4 @@ def save_routes(self, save_to_s3, d, version_date=None):

         routes = [routeconfig.RouteConfig(agency_id, route_data) for route_data in routes_data]

-        routeconfig.save_routes(agency_id, routes, save_to_s3=save_to_s3, version_date=version_date)
+        routeconfig.save_routes(agency_id, routes, save_to_s3=save_to_s3, gtfs_date=gtfs_date)
diff --git a/backend/models/routeconfig.py b/backend/models/routeconfig.py
index 52775964..928453c8 100644
--- a/backend/models/routeconfig.py
+++ b/backend/models/routeconfig.py
@@ -122,13 +122,11 @@ def get_directions_for_stop(self, stop_id):
             for s in direction['stops'] if s == stop_id ]

-def get_cache_path(agency_id, version=DefaultVersion, version_date=None):
-    # version_date is for saving old versions of routes
-    # It has nothing to do with version=DefaultVersion
-    if version_date == None:
+def get_cache_path(agency_id, version=DefaultVersion, gtfs_date=None):
+    if gtfs_date == None:
         return f'{util.get_data_dir()}/routes_{version}_{agency_id}.json'

-    return f'{util.get_data_dir()}/routes_{version}_{agency_id}_{version_date}/routes_{version}_{agency_id}_{version_date}.json'
+    return f'{util.get_data_dir()}/routes_{version}_{agency_id}_{gtfs_date}/routes_{version}_{agency_id}_{gtfs_date}.json'

@@ -187,13 +185,13 @@ def get_route_config(agency_id, route_id, version=DefaultVersion):
             return route
     return None

-def save_routes(agency_id, routes, save_to_s3=False, version_date=None):
+def save_routes(agency_id, routes, save_to_s3=False, gtfs_date=None):
     data_str = json.dumps({
         'version': DefaultVersion,
         'routes': [route.data for route in routes]
     }, separators=(',', ':'))

-    cache_path = get_cache_path(agency_id, version_date=version_date)
+    cache_path = get_cache_path(agency_id, gtfs_date=gtfs_date)

     cache_dir = Path(cache_path).parent
     if not cache_dir.exists():
         cache_dir.mkdir(parents = True, exist_ok = True)

From d2be8d4279d2a5cb6907eca01bff9004850c8014 Mon Sep 17 00:00:00 2001
From: Brian-Lee
Date: Thu, 21 May 2020 14:08:01 -0700
Subject: [PATCH 47/55] changed one missed version_date to gtfs_date and removed unnecessary imports

---
 backend/save_routes.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/backend/save_routes.py b/backend/save_routes.py
index 15d9d50b..166140df 100644
--- a/backend/save_routes.py
+++ b/backend/save_routes.py
@@ -2,9 +2,9 @@
 from compute_stats import compute_stats_for_dates
 import argparse
 from datetime import date, datetime, timedelta
-import requests
-from pathlib import Path
-from secrets import transitfeeds_api_key # you may have to create this
+#import requests
+#from pathlib import Path
+#from secrets import transitfeeds_api_key # you may have to create this
 import os

 # Downloads and parses the GTFS specification
@@ -105,7 +105,7 @@

     # save the routes
     scraper = gtfs.GtfsScraper(agency, gtfs_path=gtfs_path)
-    scraper.save_routes(save_to_s3, date_to_use, version_date=gtfs_date_to_use)
+    scraper.save_routes(save_to_s3, date_to_use, gtfs_date=gtfs_date_to_use)
     errors += scraper.errors

From 78e538601242dd2d90764d206df0ea09a1d38898 Mon Sep 17 00:00:00 2001
From: Brian-Lee
Date: Thu, 21 May 2020 14:09:21 -0700
Subject: [PATCH 48/55] removed unnecessary imports

---
 backend/save_routes.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/backend/save_routes.py b/backend/save_routes.py
index 166140df..379edb4b 100644
--- a/backend/save_routes.py
+++ b/backend/save_routes.py
@@ -2,9 +2,6 @@
 from compute_stats import compute_stats_for_dates
 import argparse
 from datetime import date, datetime, timedelta
-#import requests
-#from pathlib import Path
-#from secrets import transitfeeds_api_key # you may have to create this
 import os

 # Downloads and parses the GTFS specification

From ed4c4d77375e7d4f12ccdb0a22ac740589552401 Mon Sep 17 00:00:00 2001
From: Brian-Lee
Date: Thu, 21 May 2020 14:19:59 -0700
Subject: [PATCH 49/55] improved the comment

---
 backend/save_routes.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/backend/save_routes.py b/backend/save_routes.py
index 379edb4b..7fc0f45d 100644
--- a/backend/save_routes.py
+++ b/backend/save_routes.py
@@ -33,9 +33,11 @@
 #}
 #
 #
-# Currently the script just overwrites the one S3 path, but this process could be extended in the future to
-# store different paths for different dates, to allow fetching historical data for route configurations.
-# UPDATE: We are now saving some older routes in versioned directories in metrics-mvp/backend/data
+# When no date is provided the script just overwrites the one S3 path
+# representing the recentmost GTFS that an agency has made available that
+# is active. Providing the date adds _YYYY-MM-DD to the routes path,
+# which would allow the backend to use versioned route files.
+ + if __name__ == '__main__': parser = argparse.ArgumentParser(description='Save route configuration from GTFS and possibly Nextbus API') From 3b30c6848e8ae61a06bbe7dd6339ec5fb5dea85c Mon Sep 17 00:00:00 2001 From: Brian-Lee Date: Thu, 21 May 2020 16:02:06 -0700 Subject: [PATCH 50/55] removed unecessary parameter from save_routes method --- backend/models/gtfs.py | 4 ++-- backend/save_routes.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/backend/models/gtfs.py b/backend/models/gtfs.py index 5544e4b5..4bde9826 100644 --- a/backend/models/gtfs.py +++ b/backend/models/gtfs.py @@ -1065,7 +1065,7 @@ def get_sort_key(route_data): return route_data['title'] return sorted(routes_data, key=get_sort_key) - def save_routes(self, save_to_s3, d, gtfs_date=None): + def save_routes(self, save_to_s3, d): agency = self.agency agency_id = agency.id routes_df = self.get_gtfs_routes() @@ -1085,4 +1085,4 @@ def save_routes(self, save_to_s3, d, gtfs_date=None): routes = [routeconfig.RouteConfig(agency_id, route_data) for route_data in routes_data] - routeconfig.save_routes(agency_id, routes, save_to_s3=save_to_s3, gtfs_date=gtfs_date) + routeconfig.save_routes(agency_id, routes, save_to_s3=save_to_s3, gtfs_date=d) diff --git a/backend/save_routes.py b/backend/save_routes.py index 7fc0f45d..9c271d38 100644 --- a/backend/save_routes.py +++ b/backend/save_routes.py @@ -67,6 +67,7 @@ # should probably not be using both date_to_use and gtfs_date_to_use date_to_use=date.today() gtfs_date_to_use=date.today() + gtfs_date = date.today() gtfs_path = None else: # save with date suffix, using the GTFS file provided @@ -98,13 +99,12 @@ recentmost_date_qualified_date = candidate_date recentmost_date_qualified_zip_file = candidate_zip_file - gtfs_date_to_use = recentmost_date_qualified_date - gtfs_path = recentmost_date_qualified_zip_file + gtfs_date = recentmost_date_qualified_date gtfs_path = f'{util.get_data_dir()}/{recentmost_date_qualified_zip_file}' # save the routes scraper = gtfs.GtfsScraper(agency, gtfs_path=gtfs_path) - scraper.save_routes(save_to_s3, date_to_use, gtfs_date=gtfs_date_to_use) + scraper.save_routes(save_to_s3, gtfs_date) errors += scraper.errors From 7b81c1aa17bd52f4f6c96a3921067d1d8e54851f Mon Sep 17 00:00:00 2001 From: Brian-Lee Date: Thu, 21 May 2020 16:10:21 -0700 Subject: [PATCH 51/55] simplified vars - removed date_to_use --- backend/save_routes.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/backend/save_routes.py b/backend/save_routes.py index 9c271d38..69b94786 100644 --- a/backend/save_routes.py +++ b/backend/save_routes.py @@ -64,14 +64,11 @@ if gtfs_date is None: # save the normal way, downloading the most recent GTFS file - # should probably not be using both date_to_use and gtfs_date_to_use - date_to_use=date.today() - gtfs_date_to_use=date.today() - gtfs_date = date.today() + gtfs_date=date.today() gtfs_path = None else: # save with date suffix, using the GTFS file provided - date_to_use=datetime.strptime(gtfs_date, "%Y-%m-%d").date() + gtfs_date=datetime.strptime(gtfs_date, "%Y-%m-%d").date() gtfs_path = f'{util.get_data_dir()}/gtfs-{agency.id}-{gtfs_date}.zip' ''' @@ -95,7 +92,7 @@ candidate_day = candidate_zip_file.split('-')[4].split(".zip")[0] candidate_date_string = candidate_year+"-"+candidate_month+"-"+candidate_day candidate_date = datetime.strptime(candidate_date_string,"%Y-%m-%d").date() - if candidate_date - date_to_use <= smallest_timedelta_so_far and candidate_date <= date_to_use: + if candidate_date - gtfs_date 
<= smallest_timedelta_so_far and candidate_date <= gtfs_date: recentmost_date_qualified_date = candidate_date recentmost_date_qualified_zip_file = candidate_zip_file From 8a7443c7806fd7dcf714772ee1928958c4a66794 Mon Sep 17 00:00:00 2001 From: Brian-Lee Date: Fri, 22 May 2020 01:06:54 -0700 Subject: [PATCH 52/55] added error msg for dated gtfs file not found and moved code into new function --- backend/save_routes.py | 60 +++++++++++++++++++++++------------------- 1 file changed, 33 insertions(+), 27 deletions(-) diff --git a/backend/save_routes.py b/backend/save_routes.py index 69b94786..20dbf6ea 100644 --- a/backend/save_routes.py +++ b/backend/save_routes.py @@ -38,6 +38,38 @@ # which would allow the backend to use versioned route files. +def get_recentmost_date_qualified_gtfs_path(gtfs_date): + ''' + Find most recent zip file before gtfs_date. + recentmost_date_qualified_zip_file is: + "date qualified" and "recentmost" + + "date qualified" means the date of the file is no later than the date + argument given. + + "recentmost" means it is the most recent file that qualifies. + ''' + + recentmost_date_qualified_zip_file = "" + recentmost_date_qualified_date = gtfs_date + smallest_timedelta_so_far = timedelta.max + for candidate_zip_file in os.listdir(util.get_data_dir()): + if f'gtfs-{agency.id}-' in candidate_zip_file and '.zip' in candidate_zip_file: + candidate_year = candidate_zip_file.split('-')[2] + candidate_month = candidate_zip_file.split('-')[3] + candidate_day = candidate_zip_file.split('-')[4].split(".zip")[0] + candidate_date_string = candidate_year+"-"+candidate_month+"-"+candidate_day + candidate_date = datetime.strptime(candidate_date_string,"%Y-%m-%d").date() + if candidate_date - gtfs_date <= smallest_timedelta_so_far and candidate_date <= gtfs_date: + recentmost_date_qualified_date = candidate_date + recentmost_date_qualified_zip_file = candidate_zip_file + + gtfs_date = recentmost_date_qualified_date + gtfs_path = f'{util.get_data_dir()}/{recentmost_date_qualified_zip_file}' + if recentmost_date_qualified_zip_file == "": + print("an active GTFS for this date was not found") + raise SystemExit + return gtfs_path, gtfs_date if __name__ == '__main__': parser = argparse.ArgumentParser(description='Save route configuration from GTFS and possibly Nextbus API') @@ -71,33 +103,7 @@ gtfs_date=datetime.strptime(gtfs_date, "%Y-%m-%d").date() gtfs_path = f'{util.get_data_dir()}/gtfs-{agency.id}-{gtfs_date}.zip' - ''' - Find most recent zip file before gtfs_date. - recentmost_date_qualified_zip_file is: - "date qualified" and "recentmost" - - "date qualified" means the date of the file is no later than the date - argument given. - - "recentmost" means it is the most recent file that qualifies. 
- '''
-
-        recentmost_date_qualified_zip_file = ""
-        recentmost_date_qualified_date = gtfs_date
-        smallest_timedelta_so_far = timedelta.max
-        for candidate_zip_file in os.listdir(util.get_data_dir()):
-            if f'gtfs-{agency.id}-' in candidate_zip_file and '.zip' in candidate_zip_file:
-                candidate_year = candidate_zip_file.split('-')[2]
-                candidate_month = candidate_zip_file.split('-')[3]
-                candidate_day = candidate_zip_file.split('-')[4].split(".zip")[0]
-                candidate_date_string = candidate_year+"-"+candidate_month+"-"+candidate_day
-                candidate_date = datetime.strptime(candidate_date_string,"%Y-%m-%d").date()
-                if candidate_date - gtfs_date <= smallest_timedelta_so_far and candidate_date <= gtfs_date:
-                    recentmost_date_qualified_date = candidate_date
-                    recentmost_date_qualified_zip_file = candidate_zip_file
-
-        gtfs_date = recentmost_date_qualified_date
-        gtfs_path = f'{util.get_data_dir()}/{recentmost_date_qualified_zip_file}'
+            gtfs_path, gtfs_date = get_recentmost_date_qualified_gtfs_path(gtfs_date)

         # save the routes
         scraper = gtfs.GtfsScraper(agency, gtfs_path=gtfs_path)

From de0b556c50e34d35e78aa88812f5a4fadc5cfcf0 Mon Sep 17 00:00:00 2001
From: Brian-Lee
Date: Fri, 22 May 2020 01:25:27 -0700
Subject: [PATCH 53/55] corrected inconsistency -YYYY-MM-DD vs _YYYY-MM-DD in routes path

---
 backend/models/routeconfig.py | 2 +-
 backend/save_routes.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/backend/models/routeconfig.py b/backend/models/routeconfig.py
index 928453c8..726a3f87 100644
--- a/backend/models/routeconfig.py
+++ b/backend/models/routeconfig.py
@@ -126,7 +126,7 @@ def get_cache_path(agency_id, version=DefaultVersion, gtfs_date=None):
     if gtfs_date == None:
         return f'{util.get_data_dir()}/routes_{version}_{agency_id}.json'

-    return f'{util.get_data_dir()}/routes_{version}_{agency_id}_{gtfs_date}/routes_{version}_{agency_id}_{gtfs_date}.json'
+    return f'{util.get_data_dir()}/routes_{version}_{agency_id}-{gtfs_date}/routes_{version}_{agency_id}-{gtfs_date}.json'

diff --git a/backend/save_routes.py b/backend/save_routes.py
index 20dbf6ea..cf1171f3 100644
--- a/backend/save_routes.py
+++ b/backend/save_routes.py
@@ -34,7 +34,7 @@
 #
 #
 # When no date is provided the script just overwrites the one S3 path
 # representing the recentmost GTFS that an agency has made available that
-# is active. Providing the date adds _YYYY-MM-DD to the routes path,
+# is active. Providing the date adds -YYYY-MM-DD to the routes path,
 # which would allow the backend to use versioned route files.
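A note on the search loop that patch 52 factors out: `smallest_timedelta_so_far` is initialized to `timedelta.max` and never reassigned inside the loop, so the first half of the `if` condition is always true, and the function effectively keeps the last qualifying zip file in `os.listdir()` order rather than the most recent one. Below is a minimal sketch of the intended lookup, assuming the gtfs-{agency_id}-YYYY-MM-DD.zip naming used throughout this series; `find_gtfs_zip_on_or_before` is a hypothetical helper (not part of the patches), and `data_dir`/`agency_id`/`target_date` stand in for `util.get_data_dir()`, `agency.id`, and the parsed `--date` argument.

import os
from datetime import datetime, date

def find_gtfs_zip_on_or_before(data_dir: str, agency_id: str, target_date: date):
    # Scan data_dir for files named gtfs-<agency_id>-YYYY-MM-DD.zip and return
    # (path, date) for the most recent file dated on or before target_date,
    # or (None, None) if nothing qualifies.
    prefix = f'gtfs-{agency_id}-'
    best_path, best_date = None, None
    for name in os.listdir(data_dir):
        if not (name.startswith(prefix) and name.endswith('.zip')):
            continue
        date_str = name[len(prefix):-len('.zip')]
        try:
            candidate = datetime.strptime(date_str, '%Y-%m-%d').date()
        except ValueError:
            continue  # suffix is not a date; ignore this file
        if candidate <= target_date and (best_date is None or candidate > best_date):
            best_path, best_date = f'{data_dir}/{name}', candidate
    return best_path, best_date

Parsing the date out of the filename with strptime, rather than indexing into split('-'), also keeps the lookup correct for agency ids that themselves contain hyphens.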
From 56396103118ab9bfd15ab4c56e411ff565323f38 Mon Sep 17 00:00:00 2001 From: Brian-Lee Date: Fri, 22 May 2020 01:41:39 -0700 Subject: [PATCH 54/55] fixed introduced bug resetting gtfs_path and gtfs_date outside of ELSE --- backend/save_routes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/save_routes.py b/backend/save_routes.py index cf1171f3..ec9e43d1 100644 --- a/backend/save_routes.py +++ b/backend/save_routes.py @@ -103,7 +103,7 @@ def get_recentmost_date_qualified_gtfs_path(gtfs_date): gtfs_date=datetime.strptime(gtfs_date, "%Y-%m-%d").date() gtfs_path = f'{util.get_data_dir()}/gtfs-{agency.id}-{gtfs_date}.zip' - gtfs_path, gtfs_date = get_recentmost_date_qualified_gtfs_path(gtfs_date) + gtfs_path, gtfs_date = get_recentmost_date_qualified_gtfs_path(gtfs_date) # save the routes scraper = gtfs.GtfsScraper(agency, gtfs_path=gtfs_path) From db1bacc28ca250c31a93e65884d33a7e767773c1 Mon Sep 17 00:00:00 2001 From: Brian-Lee Date: Fri, 22 May 2020 10:39:08 -0700 Subject: [PATCH 55/55] conditionally load gtfs data from the cache-dir or gtfs_path --- backend/models/gtfs.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/backend/models/gtfs.py b/backend/models/gtfs.py index 4bde9826..f8ecce8b 100644 --- a/backend/models/gtfs.py +++ b/backend/models/gtfs.py @@ -124,7 +124,11 @@ def __init__(self, agency: config.Agency, gtfs_path=None): get_gtfs_data(agency, gtfs_cache_dir, gtfs_path=gtfs_path) - self.feed = ptg.load_geo_feed(gtfs_cache_dir, {}) + if gtfs_path is None: + self.feed = ptg.load_geo_feed(gtfs_cache_dir, {}) + else: + self.feed = ptg.load_geo_feed(gtfs_path, {}) + self.errors = [] self.stop_times_by_trip = None self.stops_df = None
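With the full series applied, a dated run such as `python save_routes.py --agency <agency_id> --date 2020-05-01` writes the route config under a per-date directory instead of overwriting the undated file. Below is a short sketch of reading one of those dated configs back, assuming the routes_{version}_{agency_id}-{gtfs_date} layout produced by `routeconfig.get_cache_path` after patch 53; `load_routes_for_date` is a hypothetical reader, not part of the series.

import json
from models import routeconfig, util

def load_routes_for_date(agency_id: str, gtfs_date: str):
    # Mirror routeconfig.get_cache_path after patch 53: dated route configs
    # live at {data_dir}/routes_{version}_{agency_id}-{gtfs_date}/<same>.json
    version = routeconfig.DefaultVersion
    base = f'routes_{version}_{agency_id}-{gtfs_date}'
    cache_path = f'{util.get_data_dir()}/{base}/{base}.json'
    with open(cache_path) as f:
        data = json.load(f)
    # routeconfig.save_routes wrote {'version': ..., 'routes': [...]},
    # so the route dicts can be rebuilt into RouteConfig objects directly.
    return [routeconfig.RouteConfig(agency_id, route_data) for route_data in data['routes']]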