Update routes for specific dates #638

Open
wants to merge 56 commits into master
Changes from 52 commits
Commits (56)
614bfe3
renamed save_routes method to save_new_routes
Brian-Lee Apr 22, 2020
857fe04
fixed naming error with routeconfig.new_save_routes
Brian-Lee Apr 22, 2020
c87b43e
added save_old_routes method
Brian-Lee Apr 22, 2020
3817c06
called save_old_routes after calling save_new_routes - doesn't save pr…
Brian-Lee Apr 22, 2020
22cb4c3
some progress making a versioned cache dir for old route
Brian-Lee Apr 23, 2020
e56fa1e
eliminated unnecessary use_versioning variable
Brian-Lee Apr 23, 2020
28b1931
consolidated save_old_routes and save_new_routes into save_routes
Brian-Lee Apr 23, 2020
cb55856
framework for executing a scrape saving routes normally followed by a…
Brian-Lee Apr 23, 2020
33a8e53
removed 'notdated' from non-archived routes JSON files
Brian-Lee Apr 23, 2020
690e0cc
removed unused method download_gtfs_data and 'dated' from filenames
Brian-Lee Apr 23, 2020
0c9fba6
put in a more realistic date for the archive date version for the sin…
Brian-Lee Apr 23, 2020
26f1772
moved imports to the top
Brian-Lee Apr 23, 2020
9998299
added reminder comment to properly get archived GTFS data
Brian-Lee Apr 23, 2020
ee2ebe5
can add multiple archive urls to archive routes
Brian-Lee Apr 23, 2020
50812c4
pulling archive urls from a list
Brian-Lee Apr 23, 2020
5f7112f
make url from date and loop through archiving urls for archiving routes
Brian-Lee Apr 23, 2020
652a459
use transitfeeds api to get old routes to version by date and cache -…
Brian-Lee Apr 24, 2020
36565da
eliminate unnecessary param archiving_old and other cleanup
Brian-Lee Apr 24, 2020
7ac1b10
some cleanup
Brian-Lee Apr 24, 2020
6ebe655
passed archiving_date instead of current date per reviewer suggestion
Brian-Lee Apr 27, 2020
ff9500b
remove unnecessary archive_date
Brian-Lee Apr 27, 2020
e618a1f
framework to take archiving_date argument
Brian-Lee Apr 27, 2020
aa0b423
added some comments
Brian-Lee Apr 27, 2020
d4f13f8
changed archived_date to gtfs_date
Brian-Lee Apr 27, 2020
6e0c3b3
combined GtfsScraper calls for both cases
Brian-Lee Apr 27, 2020
24f56e9
eliminated variable d
Brian-Lee Apr 27, 2020
bb7dbf4
added backwards date search if gtfs_date doesn't match exact zipfile date
Brian-Lee Apr 27, 2020
4d568e4
date suffix now matches actual date found and used
Brian-Lee Apr 27, 2020
d5a7cbc
removed duplicative checking for dated gtfs zipfile
Brian-Lee Apr 27, 2020
0bf3cf2
pass gtfs_path to scraper instead of gtfs_date
Brian-Lee Apr 27, 2020
e496466
some cleanup
Brian-Lee Apr 27, 2020
93b90b7
fixed bug where save_routes.py was broken without gtfs_date argument
Brian-Lee Apr 27, 2020
af19055
added a comment
Brian-Lee Apr 27, 2020
483a0b2
combined duplicative lines
Brian-Lee Apr 30, 2020
2084742
changed command line argument gtfs_date to date
Brian-Lee Apr 30, 2020
385e209
changed the method of finding most recent gtfs zip
Brian-Lee Apr 30, 2020
8711770
Merge branch 'master' of https://github.com/trynmaps/metrics-mvp into…
Brian-Lee May 7, 2020
f35f14a
combined two identical lines into one
Brian-Lee May 7, 2020
e219b42
reduced if-else to just if
Brian-Lee May 7, 2020
198a3ae
eliminated unnecessary else keyword
Brian-Lee May 7, 2020
8fcbb5b
removed outdated comments
Brian-Lee May 7, 2020
46c9927
removed unnecessary assignment of save_to_s3
Brian-Lee May 7, 2020
d4f8b72
changed
Brian-Lee May 7, 2020
2a4c807
setting starting date more appropriately to date argument
Brian-Lee May 7, 2020
a5f5a62
chained two lines into one
Brian-Lee May 21, 2020
4d24dfe
eliminated unnecessary import shutil
Brian-Lee May 21, 2020
fdba9b7
changed all occurrences of version_date to gtfs_date
Brian-Lee May 21, 2020
d2be8d4
changed one missed version_date to gtfs_date and removed unnecessary i…
Brian-Lee May 21, 2020
78e5386
removed unnecessary imports
Brian-Lee May 21, 2020
ed4c4d7
improved the comment
Brian-Lee May 21, 2020
3b30c68
removed unnecessary parameter from save_routes method
Brian-Lee May 21, 2020
7b81c1a
simplified vars - removed date_to_use
Brian-Lee May 21, 2020
8a7443c
added error msg for dated gtfs file not found and moved code into new…
Brian-Lee May 22, 2020
de0b556
corrected inconsistency -YYYY-MM-DD vs _YYYY-MM-DD in routes path
Brian-Lee May 22, 2020
5639610
fixed introduced bug resetting gtfs_path and gtfs_date outside of ELSE
Brian-Lee May 22, 2020
db1bacc
conditionally load gtfs data from the cache-dir or gtfs_path
Brian-Lee May 22, 2020
29 changes: 18 additions & 11 deletions backend/models/gtfs.py
@@ -8,6 +8,8 @@
 import gzip
 import hashlib
 import zipfile
+import os
+from datetime import datetime, timedelta
 
 from . import config, util, nextbus, routeconfig, timetables
 
@@ -49,28 +51,35 @@ def get_stop_geometry(stop_xy, shape_lines_xy, shape_cumulative_dist, start_index):
         'after_index': best_index, # the index of the coordinate of the shape just before this stop
         'offset': int(best_offset) # distance in meters between this stop and the closest line segment of shape
     }
 
 
-def download_gtfs_data(agency: config.Agency, gtfs_cache_dir):
+def get_gtfs_data(agency: config.Agency, gtfs_cache_dir, gtfs_path=None):
+    cache_dir = Path(gtfs_cache_dir)
+    zip_path = f'{util.get_data_dir()}/gtfs-{agency.id}.zip'
     gtfs_url = agency.gtfs_url
 
     if gtfs_url is None:
         raise Exception(f'agency {agency.id} does not have gtfs_url in config')
 
-    cache_dir = Path(gtfs_cache_dir)
-
     if not cache_dir.exists():
         print(f'downloading gtfs data from {gtfs_url}')
         r = requests.get(gtfs_url)
 
         if r.status_code != 200:
             raise Exception(f"Error fetching {gtfs_url}: HTTP {r.status_code}: {r.text}")
 
-        zip_path = f'{util.get_data_dir()}/gtfs-{agency.id}.zip'
-
         with open(zip_path, 'wb') as f:
             f.write(r.content)
 
-        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
-            zip_ref.extractall(gtfs_cache_dir)
+    if gtfs_path is not None:
+        zip_path = gtfs_path
+
+    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+        zip_ref.extractall(gtfs_cache_dir)
 
 
 def is_subsequence(smaller, bigger):
     smaller_len = len(smaller)
     bigger_len = len(bigger)
@@ -108,15 +117,14 @@ def contains_excluded_stop(shape_stop_ids, excluded_stop_ids):
     return False
 
 class GtfsScraper:
-    def __init__(self, agency: config.Agency):
+    def __init__(self, agency: config.Agency, gtfs_path=None):
         self.agency = agency
         self.agency_id = agency_id = agency.id
         gtfs_cache_dir = f'{util.get_data_dir()}/gtfs-{agency_id}'
 
-        download_gtfs_data(agency, gtfs_cache_dir)
+        get_gtfs_data(agency, gtfs_cache_dir, gtfs_path=gtfs_path)
         self.feed = ptg.load_geo_feed(gtfs_cache_dir, {})
-
         self.errors = []
         self.stop_times_by_trip = None
         self.stops_df = None

Review thread on the get_gtfs_data call:

Member: For say trimet, the GTFS directory ends up being /data/gtfs-trimet/gtfs-trimet-2020-02-22 instead of /data/gtfs-trimet_2020-02-22 (for a sample date). Then, in line 127 the GTFS that's passed in ends up being the one in gtfs_cache_dir, so gtfs_path ends up not being used.

Author: I added an if-else which should solve that. I think there are cleaner ways, and I haven't yet done a good job of testing this fix.

Author: I am going to re-request a review at this point. Thank you so much for looking through this over and over again!
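For readers following this thread, here is a condensed, illustrative sketch of how get_gtfs_data decides which zip to extract as of this revision. It is simplified from the diff above: the download branch and error handling are omitted, and the function name resolve_and_extract is hypothetical.

import zipfile

# Simplified from get_gtfs_data in the diff above; hypothetical name,
# download step and error handling omitted.
def resolve_and_extract(default_zip_path, gtfs_cache_dir, gtfs_path=None):
    zip_path = default_zip_path   # the agency's freshly downloaded feed by default
    if gtfs_path is not None:
        zip_path = gtfs_path      # a dated GTFS zip was supplied, so extract that one
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(gtfs_cache_dir)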
@@ -261,7 +269,6 @@ def save_timetables(self, save_to_s3=False, skip_existing=False):
         agency_id = self.agency_id
 
         dates_map = self.get_services_by_date()
-
         #
         # Typically, many dates have identical scheduled timetables (with times relative to midnight on that date).
         # Instead of storing redundant timetables for each date, store one timetable per route for each unique set of service_ids.
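As a rough sketch of the deduplication that comment describes, with assumed data shapes rather than the PR's actual implementation: dates can be grouped by their set of service_ids, and one timetable stored per unique set.

# Rough sketch of the idea in the comment above; the dates_map shape is
# assumed for illustration and is not the PR's actual implementation.
from collections import defaultdict

dates_map = {
    '2020-05-01': ['weekday'],
    '2020-05-02': ['saturday'],
    '2020-05-04': ['weekday'],
}

dates_by_service_ids = defaultdict(list)
for d, service_ids in dates_map.items():
    dates_by_service_ids[frozenset(service_ids)].append(d)

# one timetable per key instead of one per date:
# frozenset({'weekday'}):  ['2020-05-01', '2020-05-04']
# frozenset({'saturday'}): ['2020-05-02']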
@@ -1078,4 +1085,4 @@ def save_routes(self, save_to_s3, d):
 
         routes = [routeconfig.RouteConfig(agency_id, route_data) for route_data in routes_data]
 
-        routeconfig.save_routes(agency_id, routes, save_to_s3=save_to_s3)
+        routeconfig.save_routes(agency_id, routes, save_to_s3=save_to_s3, gtfs_date=d)
19 changes: 14 additions & 5 deletions backend/models/routeconfig.py
@@ -1,4 +1,5 @@
 import re, os, time, requests, json, boto3, gzip
+from pathlib import Path
 from . import util, config
 
 DefaultVersion = 'v3a'
@@ -121,8 +122,13 @@ def get_directions_for_stop(self, stop_id):
             for s in direction['stops'] if s == stop_id
         ]
 
-def get_cache_path(agency_id, version=DefaultVersion):
-    return f'{util.get_data_dir()}/routes_{version}_{agency_id}.json'
+def get_cache_path(agency_id, version=DefaultVersion, gtfs_date=None):
+    if gtfs_date is None:
+        return f'{util.get_data_dir()}/routes_{version}_{agency_id}.json'
+
+    return f'{util.get_data_dir()}/routes_{version}_{agency_id}_{gtfs_date}/routes_{version}_{agency_id}_{gtfs_date}.json'
+
 
 def get_s3_path(agency_id, version=DefaultVersion):
     return f'routes/{version}/routes_{version}_{agency_id}.json.gz'
@@ -179,14 +185,17 @@ def get_route_config(agency_id, route_id, version=DefaultVersion):
             return route
     return None
 
-def save_routes(agency_id, routes, save_to_s3=False):
+def save_routes(agency_id, routes, save_to_s3=False, gtfs_date=None):
     data_str = json.dumps({
         'version': DefaultVersion,
         'routes': [route.data for route in routes]
     }, separators=(',', ':'))
 
-    cache_path = get_cache_path(agency_id)
+    cache_path = get_cache_path(agency_id, gtfs_date=gtfs_date)
+    cache_dir = Path(cache_path).parent
+    if not cache_dir.exists():
+        cache_dir.mkdir(parents=True, exist_ok=True)
 
     with open(cache_path, "w") as f:
         f.write(data_str)
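To make the new path scheme concrete, here is a small illustrative sketch of the two shapes get_cache_path can now return. It assumes util.get_data_dir() returns 'data' and the default version 'v3a'; the agency id 'trimet' is just an example, none of which is prescribed by this PR.

# Illustrative sketch only: 'data' stands in for util.get_data_dir(), and
# 'trimet' is an example agency id. Mirrors get_cache_path in the diff above.
def get_cache_path(agency_id, version='v3a', gtfs_date=None):
    data_dir = 'data'
    if gtfs_date is None:
        return f'{data_dir}/routes_{version}_{agency_id}.json'
    return f'{data_dir}/routes_{version}_{agency_id}_{gtfs_date}/routes_{version}_{agency_id}_{gtfs_date}.json'

print(get_cache_path('trimet'))
# data/routes_v3a_trimet.json
print(get_cache_path('trimet', gtfs_date='2020-05-01'))
# data/routes_v3a_trimet_2020-05-01/routes_v3a_trimet_2020-05-01.json

Because the dated variant nests the JSON file inside a per-date directory, save_routes now creates the parent directory before writing, as shown in the diff above.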
65 changes: 55 additions & 10 deletions backend/save_routes.py
@@ -1,7 +1,8 @@
-from models import gtfs, config
+from models import gtfs, config, util
 from compute_stats import compute_stats_for_dates
 import argparse
-from datetime import date
+from datetime import date, datetime, timedelta
+import os
 
 # Downloads and parses the GTFS specification
 # and saves the configuration for all routes to S3.
@@ -32,41 +33,85 @@
 #}
 #
 #
-# Currently the script just overwrites the one S3 path, but this process could be extended in the future to
-# store different paths for different dates, to allow fetching historical data for route configurations.
-#
+# When no date is provided, the script just overwrites the one S3 path,
+# which represents the most recent active GTFS feed the agency has made available.
+# Providing a date adds _YYYY-MM-DD to the routes path,
+# which allows the backend to use versioned route files.
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='Save route configuration from GTFS and possibly Nextbus API')
     parser.add_argument('--agency', required=False, help='Agency ID')
     parser.add_argument('--s3', dest='s3', action='store_true', help='store in s3')
     parser.add_argument('--timetables', dest='timetables', action='store_true', help='also save timetables')
     parser.add_argument('--scheduled-stats', dest='scheduled_stats', action='store_true', help='also compute scheduled stats if the timetable has new dates (requires --timetables)')
+    parser.add_argument('--date', required=False, help='save versioned routes using the GTFS feed dated closest to, but not after, this date (YYYY-MM-DD)')
     parser.set_defaults(s3=False)
     parser.set_defaults(timetables=False)
     parser.set_defaults(scheduled_stats=False)
+    parser.set_defaults(date=None)
 
     args = parser.parse_args()
 
     agencies = [config.get_agency(args.agency)] if args.agency is not None else config.agencies
 
     save_to_s3 = args.s3
-    d = date.today()
 
     errors = []
 
     for agency in agencies:
-        scraper = gtfs.GtfsScraper(agency)
-        scraper.save_routes(save_to_s3, d)
-
+        gtfs_date = args.date  # reset per agency; reassigned to a date object below
+        if gtfs_date is None:
+            # save the normal way, downloading the most recent GTFS file
+            gtfs_date = date.today()
+            gtfs_path = None
+        else:
+            # save with a date suffix, using the cached GTFS zip dated
+            # closest to, but not after, the requested date
+            gtfs_date = datetime.strptime(gtfs_date, "%Y-%m-%d").date()
+
+            # Find the most recent "date qualified" zip file:
+            # "date qualified" means the date of the file is no later than the
+            # date argument given; "recentmost" means it is the most recent
+            # file that qualifies.
+            recentmost_date_qualified_zip_file = ""
+            recentmost_date_qualified_date = gtfs_date
+            smallest_timedelta_so_far = timedelta.max
+            for candidate_zip_file in os.listdir(util.get_data_dir()):
+                if f'gtfs-{agency.id}-' in candidate_zip_file and '.zip' in candidate_zip_file:
+                    candidate_year = candidate_zip_file.split('-')[2]
+                    candidate_month = candidate_zip_file.split('-')[3]
+                    candidate_day = candidate_zip_file.split('-')[4].split(".zip")[0]
+                    candidate_date_string = f'{candidate_year}-{candidate_month}-{candidate_day}'
+                    candidate_date = datetime.strptime(candidate_date_string, "%Y-%m-%d").date()
+                    if candidate_date <= gtfs_date and gtfs_date - candidate_date < smallest_timedelta_so_far:
+                        smallest_timedelta_so_far = gtfs_date - candidate_date
+                        recentmost_date_qualified_date = candidate_date
+                        recentmost_date_qualified_zip_file = candidate_zip_file
+
+            gtfs_date = recentmost_date_qualified_date
+            gtfs_path = f'{util.get_data_dir()}/{recentmost_date_qualified_zip_file}'
 
+        # save the routes
+        scraper = gtfs.GtfsScraper(agency, gtfs_path=gtfs_path)
+        scraper.save_routes(save_to_s3, gtfs_date)
+        errors += scraper.errors
 
         if args.timetables:
             timetables_updated = scraper.save_timetables(save_to_s3=save_to_s3, skip_existing=True)
 
             if timetables_updated and args.scheduled_stats:
                 dates = sorted(scraper.get_services_by_date().keys())
                 compute_stats_for_dates(dates, agency, scheduled=True, save_to_s3=save_to_s3)
 
-        errors += scraper.errors
-
     if errors:
         raise Exception("\n".join(errors))
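The backward date search above can also be read in isolation. Below is a self-contained, hypothetical sketch (the name find_dated_gtfs_zip is illustrative, not part of the PR): among files named gtfs-<agency>-YYYY-MM-DD.zip, return the one dated closest to, but not after, a target date.

# Self-contained, hypothetical sketch of the backward date search above:
# among gtfs-<agency>-YYYY-MM-DD.zip files in data_dir, return the one dated
# closest to, but not after, target_date. For illustration only.
import os
import re
from datetime import date, datetime

def find_dated_gtfs_zip(data_dir, agency_id, target_date):
    pattern = re.compile(rf'^gtfs-{re.escape(agency_id)}-(\d{{4}}-\d{{2}}-\d{{2}})\.zip$')
    best_date, best_file = None, None
    for name in os.listdir(data_dir):
        match = pattern.match(name)
        if match is None:
            continue
        candidate = datetime.strptime(match.group(1), '%Y-%m-%d').date()
        if candidate <= target_date and (best_date is None or candidate > best_date):
            best_date, best_file = candidate, name
    return best_date, best_file

# e.g. find_dated_gtfs_zip('data', 'trimet', date(2020, 5, 1)) might return
# (date(2020, 4, 22), 'gtfs-trimet-2020-04-22.zip') if that is the newest
# feed at or before the requested date.

Under this scheme, hypothetical invocations from the backend directory would look like:

python save_routes.py --agency trimet (overwrite the undated routes file)
python save_routes.py --agency trimet --date 2020-05-01 (write a dated routes file from the closest feed at or before that date)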