-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: automate gtfs & gbfs data preprocessing for analytics (#713)
- Loading branch information
Showing
19 changed files
with
1,800 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
from typing import Dict | ||
|
||
from database_gen.sqlacodegen_models import Feed | ||
|
||
|
||
def translate_feed_locations(feed: Feed, location_translations: Dict):
    """
    Apply translated names to every location attached to a feed.

    For each location on the feed, look up its translation entry by location
    id. When an entry exists, each of subdivision name, municipality, and
    country is replaced by its translated value; a falsy translation value
    (e.g. None) leaves the original attribute untouched. Locations with no
    translation entry are skipped entirely.

    :param feed: The feed object whose ``locations`` are updated in place
    :param location_translations: Mapping of location id to a dict with keys
        ``subdivision_name_translation``, ``municipality_translation`` and
        ``country_translation``
    """
    for location in feed.locations:
        translation = location_translations.get(location.id)
        if not translation:
            # No translation recorded for this location — leave it as-is.
            continue
        # `x or fallback` keeps the original value when the translation is falsy.
        location.subdivision_name = (
            translation["subdivision_name_translation"] or location.subdivision_name
        )
        location.municipality = (
            translation["municipality_translation"] or location.municipality
        )
        location.country = (
            translation["country_translation"] or location.country
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
import unittest | ||
from unittest.mock import MagicMock | ||
from database_gen.sqlacodegen_models import Feed, Location | ||
from helpers.locations import translate_feed_locations | ||
|
||
|
||
class TestTranslateFeedLocations(unittest.TestCase):
    """Unit tests for helpers.locations.translate_feed_locations."""

    def _make_location(self):
        """Return a Location mock preloaded with the 'Original *' attributes."""
        location = MagicMock(spec=Location)
        location.id = 1
        location.subdivision_name = "Original Subdivision"
        location.municipality = "Original Municipality"
        location.country = "Original Country"
        return location

    def _make_feed(self, location):
        """Return a Feed mock whose locations list holds the given location."""
        feed = MagicMock(spec=Feed)
        feed.locations = [location]
        return feed

    def test_translate_feed_locations(self):
        # Full translation entry: every attribute should be replaced.
        location = self._make_location()
        feed = self._make_feed(location)
        translations = {
            1: {
                "subdivision_name_translation": "Translated Subdivision",
                "municipality_translation": "Translated Municipality",
                "country_translation": "Translated Country",
            }
        }

        translate_feed_locations(feed, translations)

        self.assertEqual(location.subdivision_name, "Translated Subdivision")
        self.assertEqual(location.municipality, "Translated Municipality")
        self.assertEqual(location.country, "Translated Country")

    def test_translate_feed_locations_with_missing_translations(self):
        # Partial entry: None values must leave the original attributes alone.
        location = self._make_location()
        feed = self._make_feed(location)
        translations = {
            1: {
                "subdivision_name_translation": None,
                "municipality_translation": None,
                "country_translation": "Translated Country",
            }
        }

        translate_feed_locations(feed, translations)

        self.assertEqual(
            location.subdivision_name, "Original Subdivision"
        )  # No translation
        self.assertEqual(
            location.municipality, "Original Municipality"
        )  # No translation
        self.assertEqual(location.country, "Translated Country")  # Translated

    def test_translate_feed_locations_with_no_translation(self):
        # Empty mapping: the location must be left completely untouched.
        location = self._make_location()
        feed = self._make_feed(location)

        translate_feed_locations(feed, {})

        self.assertEqual(location.subdivision_name, "Original Subdivision")
        self.assertEqual(location.municipality, "Original Municipality")
        self.assertEqual(location.country, "Original Country")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
[run] | ||
omit = | ||
*/test*/* | ||
*/helpers/* | ||
*/database_gen/* | ||
|
||
[report] | ||
exclude_lines = | ||
if __name__ == .__main__.: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,197 @@ | ||
# GTFS & GBFS Analytics Processor | ||
|
||
This directory contains Google Cloud Functions that automate the retrieval, processing, and analytics generation for GTFS and GBFS datasets. The project is designed to handle and analyze both GTFS and GBFS data, storing the results in Google Cloud Storage. | ||
|
||
## Overview | ||
|
||
### `process_analytics_gtfs` | ||
|
||
This HTTP-triggered Cloud Function processes GTFS datasets by performing the following steps: | ||
|
||
1. **Retrieving Data**: Fetches the latest GTFS dataset per feed from the database. | ||
2. **Processing Data**: Analyzes the dataset, extracting metrics related to validation notices, features, and geographical locations. | ||
3. **Storing Analytics**: Saves the processed data as JSON files in the Google Cloud Storage bucket, updating metrics and analytics files. | ||
|
||
#### Files Modified/Created: | ||
- **`analytics_YYYY-MM-DD.json`**: Contains the GTFS analytics data for the specific date in JSON format. | ||
**Format:** | ||
```json | ||
{ | ||
"feed_id": "string", | ||
"dataset_id": "string", | ||
"notices": { | ||
"errors": ["string"], | ||
"warnings": ["string"], | ||
"infos": ["string"] | ||
}, | ||
"features": ["string"], | ||
"created_on": "datetime", | ||
"last_modified": "datetime", | ||
"provider": "string", | ||
"locations": [ | ||
{ | ||
"country_code": "string", | ||
"country": "string", | ||
"municipality": "string", | ||
"subdivision_name": "string" | ||
} | ||
] | ||
} | ||
``` | ||
|
||
- **`feed_metrics.json`**: Stores aggregated feed-level metrics, including error, warning, and info counts. | ||
**Format:** | ||
```json | ||
{ | ||
"feed_id": "string", | ||
"computed_on": ["datetime"], | ||
"errors_count": ["int"], | ||
"warnings_count": ["int"], | ||
"infos_count": ["int"] | ||
} | ||
``` | ||
|
||
- **`features_metrics.json`**: Tracks feature usage across feeds, showing the number of feeds using specific features. | ||
**Format:** | ||
```json | ||
{ | ||
"feature": "string", | ||
"computed_on": ["datetime"], | ||
"feeds_count": ["int"] | ||
} | ||
``` | ||
|
||
- **`notices_metrics.json`**: Records notice metrics by severity level (error, warning, info). | ||
**Format:** | ||
```json | ||
{ | ||
"notice": "string", | ||
"severity": "string", | ||
"computed_on": ["datetime"], | ||
"feeds_count": ["int"] | ||
} | ||
``` | ||
|
||
- **`summary/summary_YYYY-MM-DD.json`**: Contains aggregated metrics for the specific date, including feed metrics, feature metrics, and notice metrics. | ||
**Format:** | ||
```json | ||
{ | ||
"feed_metrics": [...], | ||
"features_metrics": [...], | ||
"notices_metrics": [...] | ||
} | ||
``` | ||
|
||
- **`analytics_files.json`**: Index of all `analytics_YYYY-MM-DD.json` files stored in the bucket. | ||
**Format:** | ||
```json | ||
{ | ||
"file_name": "string", | ||
"created_on": "datetime" | ||
} | ||
``` | ||
|
||
### `process_analytics_gbfs` | ||
|
||
This HTTP-triggered Cloud Function processes GBFS datasets by performing the following steps: | ||
|
||
1. **Retrieving Data**: Fetches the latest GBFS snapshot per feed from the database. | ||
2. **Processing Data**: Analyzes the snapshot, extracting metrics related to validation notices, versions, and geographical locations. | ||
3. **Storing Analytics**: Saves the processed data as JSON files in the Google Cloud Storage bucket, updating metrics and analytics files. | ||
|
||
#### Files Modified/Created: | ||
- **`analytics_YYYY-MM-DD.json`**: Contains the GBFS analytics data for the specific date in JSON format. | ||
**Format:** | ||
```json | ||
{ | ||
"feed_id": "string", | ||
"snapshot_id": "string", | ||
"notices": [ | ||
{ | ||
"keyword": "string", | ||
"gbfs_file": "string", | ||
"schema_path": "string" | ||
} | ||
], | ||
"created_on": "datetime", | ||
"operator": "string", | ||
"locations": [ | ||
{ | ||
"country_code": "string", | ||
"country": "string", | ||
"municipality": "string", | ||
"subdivision_name": "string" | ||
} | ||
] | ||
} | ||
``` | ||
|
||
- **`feed_metrics.json`**: Stores aggregated feed-level metrics, including error counts. | ||
**Format:** | ||
```json | ||
{ | ||
"feed_id": "string", | ||
"computed_on": ["datetime"], | ||
"errors_count": ["int"] | ||
} | ||
``` | ||
|
||
- **`versions_metrics.json`**: Tracks the usage of different GBFS versions across feeds. | ||
**Format:** | ||
```json | ||
{ | ||
"version": "string", | ||
"computed_on": ["datetime"], | ||
"feeds_count": ["int"] | ||
} | ||
``` | ||
|
||
- **`notices_metrics.json`**: Records notice metrics specific to GBFS, categorized by keyword, file, and schema path. | ||
**Format:** | ||
```json | ||
{ | ||
"keyword": "string", | ||
"gbfs_file": "string", | ||
"schema_path": "string", | ||
"computed_on": ["datetime"], | ||
"feeds_count": ["int"] | ||
} | ||
``` | ||
|
||
- **`summary/summary_YYYY-MM-DD.json`**: Contains aggregated metrics for the specific date, including feed metrics, version metrics, and notice metrics. | ||
**Format:** | ||
```json | ||
{ | ||
"feed_metrics": [...], | ||
"versions_metrics": [...], | ||
"notices_metrics": [...] | ||
} | ||
``` | ||
|
||
- **`analytics_files.json`**: Index of all `analytics_YYYY-MM-DD.json` files stored in the bucket. | ||
**Format:** | ||
```json | ||
{ | ||
"file_name": "string", | ||
"created_on": "datetime" | ||
} | ||
``` | ||
|
||
## Project Structure | ||
|
||
- **`main.py`**: Defines the HTTP-triggered Cloud Functions that initiate the GTFS and GBFS data analytics processes. | ||
- **`processors/base_analytics_processor.py`**: Contains the base class for analytics processing, providing common logic for GTFS and GBFS processors. | ||
- **`processors/gtfs_analytics_processor.py`**: Implements GTFS-specific data retrieval and processing logic. | ||
- **`processors/gbfs_analytics_processor.py`**: Implements GBFS-specific data retrieval and processing logic. | ||
- **`tests/`**: Unit tests for all modules and functions, ensuring correct functionality and robustness. | ||
|
||
## Project Configuration | ||
|
||
The following environment variables need to be set: | ||
|
||
- `FEEDS_DATABASE_URL`: The URL for the database containing GTFS and GBFS feeds. | ||
- `ANALYTICS_BUCKET`: The name of the Google Cloud Storage bucket where analytics results are stored. | ||
|
||
## Local Development | ||
|
||
Refer to the main [README.md](../README.md) for general setup instructions for the development environment. |
20 changes: 20 additions & 0 deletions
20
functions-python/preprocessed_analytics/function_config.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
{ | ||
"name": "preprocess-analytics", | ||
"description": "Preprocess analytics", | ||
"entry_point": "preprocess_analytics", | ||
"timeout": 540, | ||
"memory": "2Gi", | ||
"trigger_http": false, | ||
"include_folders": ["database_gen", "helpers"], | ||
"environment_variables": [], | ||
"secret_environment_variables": [ | ||
{ | ||
"key": "FEEDS_DATABASE_URL" | ||
} | ||
], | ||
"ingress_settings": "ALLOW_ALL", | ||
"max_instance_request_concurrency": 1, | ||
"max_instance_count": 5, | ||
"min_instance_count": 0, | ||
"available_cpu": 1 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
functions-framework==3.* | ||
google-cloud-logging | ||
google-cloud-bigquery | ||
google-cloud-storage | ||
psycopg2-binary==2.9.6 | ||
aiohttp~=3.8.6 | ||
asyncio~=3.4.3 | ||
urllib3~=2.1.0 | ||
SQLAlchemy==2.0.23 | ||
geoalchemy2==0.14.7 | ||
requests~=2.31.0 | ||
attrs~=23.1.0 | ||
pluggy~=1.3.0 | ||
certifi~=2023.7.22 | ||
pandas |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
Faker | ||
pytest~=7.4.3 | ||
urllib3-mock | ||
requests-mock |
Empty file.
Oops, something went wrong.