feat: automate gtfs & gbfs data preprocessing for analytics (#713)
cka-y authored Aug 27, 2024
1 parent e1823aa commit 6301db1
Showing 19 changed files with 1,800 additions and 3 deletions.
30 changes: 30 additions & 0 deletions functions-python/helpers/locations.py
@@ -0,0 +1,30 @@
from typing import Dict

from database_gen.sqlacodegen_models import Feed


def translate_feed_locations(feed: Feed, location_translations: Dict):
    """
    Translate the locations of a feed.
    :param feed: The feed object
    :param location_translations: The location translations
    """
    for location in feed.locations:
        location_translation = location_translations.get(location.id)

        if location_translation:
            location.subdivision_name = (
                location_translation["subdivision_name_translation"]
                if location_translation["subdivision_name_translation"]
                else location.subdivision_name
            )
            location.municipality = (
                location_translation["municipality_translation"]
                if location_translation["municipality_translation"]
                else location.municipality
            )
            location.country = (
                location_translation["country_translation"]
                if location_translation["country_translation"]
                else location.country
            )
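
For context, here is a minimal usage sketch of this helper. It is not part of the commit; the stubbed feed mirrors the test setup added below, and the location id and translated values are made up:

```python
from unittest.mock import MagicMock

from database_gen.sqlacodegen_models import Feed, Location
from helpers.locations import translate_feed_locations

# Hypothetical feed with a single location (illustrative values only).
location = MagicMock(spec=Location)
location.id = "loc-CA-QC"
location.subdivision_name = "Quebec"
location.municipality = None
location.country = "CA"

feed = MagicMock(spec=Feed)
feed.locations = [location]

# Keys mirror those read by translate_feed_locations.
translate_feed_locations(
    feed,
    {
        "loc-CA-QC": {
            "subdivision_name_translation": "Québec",
            "municipality_translation": "Montréal",
            "country_translation": "Canada",
        }
    },
)
print(location.country)  # "Canada"; untranslated fields keep their original values
```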
4 changes: 2 additions & 2 deletions functions-python/helpers/requirements.txt
@@ -11,8 +11,8 @@ psycopg2-binary==2.9.6
 aiohttp
 asyncio
 urllib3~=2.1.0
-SQLAlchemy==1.4.49
-geoalchemy2
+SQLAlchemy==2.0.23
+geoalchemy2==0.14.7
 requests~=2.31.0
 cloudevents~=1.10.1
 requests_mock
91 changes: 91 additions & 0 deletions functions-python/helpers/tests/test_locations.py
@@ -0,0 +1,91 @@
import unittest
from unittest.mock import MagicMock
from database_gen.sqlacodegen_models import Feed, Location
from helpers.locations import translate_feed_locations


class TestTranslateFeedLocations(unittest.TestCase):
    def test_translate_feed_locations(self):
        # Mock a location object with specific attributes
        mock_location = MagicMock(spec=Location)
        mock_location.id = 1
        mock_location.subdivision_name = "Original Subdivision"
        mock_location.municipality = "Original Municipality"
        mock_location.country = "Original Country"

        # Mock a feed object with locations
        mock_feed = MagicMock(spec=Feed)
        mock_feed.locations = [mock_location]

        # Define a translation dictionary
        location_translations = {
            1: {
                "subdivision_name_translation": "Translated Subdivision",
                "municipality_translation": "Translated Municipality",
                "country_translation": "Translated Country",
            }
        }

        # Call the translate_feed_locations function
        translate_feed_locations(mock_feed, location_translations)

        # Assert that the location's attributes were updated with translations
        self.assertEqual(mock_location.subdivision_name, "Translated Subdivision")
        self.assertEqual(mock_location.municipality, "Translated Municipality")
        self.assertEqual(mock_location.country, "Translated Country")

    def test_translate_feed_locations_with_missing_translations(self):
        # Mock a location object with specific attributes
        mock_location = MagicMock(spec=Location)
        mock_location.id = 1
        mock_location.subdivision_name = "Original Subdivision"
        mock_location.municipality = "Original Municipality"
        mock_location.country = "Original Country"

        # Mock a feed object with locations
        mock_feed = MagicMock(spec=Feed)
        mock_feed.locations = [mock_location]

        # Define a translation dictionary with missing translations
        location_translations = {
            1: {
                "subdivision_name_translation": None,
                "municipality_translation": None,
                "country_translation": "Translated Country",
            }
        }

        # Call the translate_feed_locations function
        translate_feed_locations(mock_feed, location_translations)

        # Assert that the location's attributes were updated correctly
        self.assertEqual(
            mock_location.subdivision_name, "Original Subdivision"
        )  # No translation
        self.assertEqual(
            mock_location.municipality, "Original Municipality"
        )  # No translation
        self.assertEqual(mock_location.country, "Translated Country")  # Translated

    def test_translate_feed_locations_with_no_translation(self):
        # Mock a location object with specific attributes
        mock_location = MagicMock(spec=Location)
        mock_location.id = 1
        mock_location.subdivision_name = "Original Subdivision"
        mock_location.municipality = "Original Municipality"
        mock_location.country = "Original Country"

        # Mock a feed object with locations
        mock_feed = MagicMock(spec=Feed)
        mock_feed.locations = [mock_location]

        # Define an empty translation dictionary
        location_translations = {}

        # Call the translate_feed_locations function
        translate_feed_locations(mock_feed, location_translations)

        # Assert that the location's attributes remain unchanged
        self.assertEqual(mock_location.subdivision_name, "Original Subdivision")
        self.assertEqual(mock_location.municipality, "Original Municipality")
        self.assertEqual(mock_location.country, "Original Country")
9 changes: 9 additions & 0 deletions functions-python/preprocessed_analytics/.coveragerc
@@ -0,0 +1,9 @@
[run]
omit =
    */test*/*
    */helpers/*
    */database_gen/*

[report]
exclude_lines =
    if __name__ == .__main__.:
197 changes: 197 additions & 0 deletions functions-python/preprocessed_analytics/README.md
@@ -0,0 +1,197 @@
# GTFS & GBFS Analytics Processor

This directory contains Google Cloud Functions that automate the retrieval, processing, and analytics generation for GTFS and GBFS datasets, storing the results in Google Cloud Storage.

## Overview

### `process_analytics_gtfs`

This HTTP-triggered Cloud Function processes GTFS datasets by performing the following steps:

1. **Retrieving Data**: Fetches the latest GTFS dataset per feed from the database.
2. **Processing Data**: Analyzes the dataset, extracting metrics related to validation notices, features, and geographical locations.
3. **Storing Analytics**: Saves the processed data as JSON files in the Google Cloud Storage bucket, updating metrics and analytics files.

#### Files Modified/Created:
- **`analytics_YYYY-MM-DD.json`**: Contains the GTFS analytics data for the specific date in JSON format.
**Format:**
```json
{
  "feed_id": "string",
  "dataset_id": "string",
  "notices": {
    "errors": ["string"],
    "warnings": ["string"],
    "infos": ["string"]
  },
  "features": ["string"],
  "created_on": "datetime",
  "last_modified": "datetime",
  "provider": "string",
  "locations": [
    {
      "country_code": "string",
      "country": "string",
      "municipality": "string",
      "subdivision_name": "string"
    }
  ]
}
```

- **`feed_metrics.json`**: Stores aggregated feed-level metrics, including error, warning, and info counts.
**Format:**
```json
{
  "feed_id": "string",
  "computed_on": ["datetime"],
  "errors_count": ["int"],
  "warnings_count": ["int"],
  "infos_count": ["int"]
}
```

- **`features_metrics.json`**: Tracks feature usage across feeds, showing the number of feeds using specific features.
**Format:**
```json
{
  "feature": "string",
  "computed_on": ["datetime"],
  "feeds_count": ["int"]
}
```

- **`notices_metrics.json`**: Records notice metrics by severity level (error, warning, info).
**Format:**
```json
{
  "notice": "string",
  "severity": "string",
  "computed_on": ["datetime"],
  "feeds_count": ["int"]
}
```

- **`summary/summary_YYYY-MM-DD.json`**: Contains aggregated metrics for the specific date, including feed metrics, feature metrics, and notice metrics.
**Format:**
```json
{
  "feed_metrics": [...],
  "features_metrics": [...],
  "notices_metrics": [...]
}
```

- **`analytics_files.json`**: Index of all `analytics_YYYY-MM-DD.json` files stored in the bucket.
**Format:**
```json
{
  "file_name": "string",
  "created_on": "datetime"
}
```
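
Beyond the formats above, maintaining `analytics_files.json` amounts to a read-append-write cycle against the bucket. Below is a hedged sketch using `google-cloud-storage` (a declared dependency); the helper name and the list-of-entries layout are assumptions, not code from this commit:

```python
import json
from datetime import datetime, timezone

from google.cloud import storage  # declared in requirements.txt


def register_analytics_file(bucket_name: str, file_name: str) -> None:
    """Hypothetical helper: append one entry to the analytics_files.json index."""
    blob = storage.Client().bucket(bucket_name).blob("analytics_files.json")
    # Load the existing index, or start a fresh one on first run.
    index = json.loads(blob.download_as_text()) if blob.exists() else []
    index.append(
        {
            "file_name": file_name,
            "created_on": datetime.now(timezone.utc).isoformat(),
        }
    )
    blob.upload_from_string(json.dumps(index), content_type="application/json")
```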

### `process_analytics_gbfs`

This HTTP-triggered Cloud Function processes GBFS datasets by performing the following steps:

1. **Retrieving Data**: Fetches the latest GBFS snapshot per feed from the database.
2. **Processing Data**: Analyzes the snapshot, extracting metrics related to validation notices, versions, and geographical locations.
3. **Storing Analytics**: Saves the processed data as JSON files in the Google Cloud Storage bucket, updating metrics and analytics files.

#### Files Modified/Created:
- **`analytics_YYYY-MM-DD.json`**: Contains the GBFS analytics data for the specific date in JSON format.
**Format:**
```json
{
  "feed_id": "string",
  "snapshot_id": "string",
  "notices": [
    {
      "keyword": "string",
      "gbfs_file": "string",
      "schema_path": "string"
    }
  ],
  "created_on": "datetime",
  "operator": "string",
  "locations": [
    {
      "country_code": "string",
      "country": "string",
      "municipality": "string",
      "subdivision_name": "string"
    }
  ]
}
```

- **`feed_metrics.json`**: Stores aggregated feed-level metrics, including error counts.
**Format:**
```json
{
  "feed_id": "string",
  "computed_on": ["datetime"],
  "errors_count": ["int"]
}
```

- **`versions_metrics.json`**: Tracks the usage of different GBFS versions across feeds.
**Format:**
```json
{
  "version": "string",
  "computed_on": ["datetime"],
  "feeds_count": ["int"]
}
```

- **`notices_metrics.json`**: Records notice metrics specific to GBFS, categorized by keyword, file, and schema path.
**Format:**
```json
{
  "keyword": "string",
  "gbfs_file": "string",
  "schema_path": "string",
  "computed_on": ["datetime"],
  "feeds_count": ["int"]
}
```

- **`summary/summary_YYYY-MM-DD.json`**: Contains aggregated metrics for the specific date, including feed metrics, version metrics, and notice metrics.
**Format:**
```json
{
  "feed_metrics": [...],
  "versions_metrics": [...],
  "notices_metrics": [...]
}
```

- **`analytics_files.json`**: Index of all `analytics_YYYY-MM-DD.json` files stored in the bucket.
**Format:**
```json
{
  "file_name": "string",
  "created_on": "datetime"
}
```
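
For illustration, the per-version feed counts in `versions_metrics.json` reduce to a group-by over the latest snapshots. A sketch with `pandas` (a declared dependency), using made-up input rows:

```python
import pandas as pd

# Made-up rows: one per feed, taken from each feed's latest GBFS snapshot.
snapshots = pd.DataFrame(
    [
        {"feed_id": "gbfs-1", "version": "2.3"},
        {"feed_id": "gbfs-2", "version": "2.3"},
        {"feed_id": "gbfs-3", "version": "1.1"},
    ]
)

# Distinct feeds per GBFS version, mirroring feeds_count in versions_metrics.json.
feeds_per_version = snapshots.groupby("version")["feed_id"].nunique()
print(feeds_per_version.to_dict())  # {'1.1': 1, '2.3': 2}
```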

## Project Structure

- **`main.py`**: Defines the HTTP-triggered Cloud Functions that initiate the GTFS and GBFS data analytics processes.
- **`processors/base_analytics_processor.py`**: Contains the base class for analytics processing, providing common logic for GTFS and GBFS processors.
- **`processors/gtfs_analytics_processor.py`**: Implements GTFS-specific data retrieval and processing logic.
- **`processors/gbfs_analytics_processor.py`**: Implements GBFS-specific data retrieval and processing logic.
- **`tests/`**: Unit tests for all modules and functions, ensuring correct functionality and robustness.
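
The split between the base class and the two format-specific processors suggests a template-method layout. The sketch below is an assumption about the shape, with hypothetical method names, not the actual implementation:

```python
from abc import ABC, abstractmethod
from typing import Any


class BaseAnalyticsProcessor(ABC):
    """Sketch of the shared pipeline: retrieve, process, store."""

    @abstractmethod
    def get_latest_data(self) -> Any:
        """Fetch the latest dataset (GTFS) or snapshot (GBFS) per feed."""

    @abstractmethod
    def process(self, data: Any) -> dict:
        """Extract metrics: notices, features or versions, and locations."""

    @abstractmethod
    def save(self, analytics: dict) -> None:
        """Write the dated analytics and metrics JSON files to the bucket."""

    def run(self) -> None:
        # Control flow shared by the GTFS and GBFS subclasses.
        self.save(self.process(self.get_latest_data()))
```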

## Project Configuration

The following environment variables need to be set:

- `FEEDS_DATABASE_URL`: The URL for the database containing GTFS and GBFS feeds.
- `ANALYTICS_BUCKET`: The name of the Google Cloud Storage bucket where analytics results are stored.
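
A minimal sketch of reading this configuration (illustrative; the actual code in `main.py` may differ):

```python
import os

# FEEDS_DATABASE_URL is injected as a secret (see function_config.json);
# ANALYTICS_BUCKET names the target Cloud Storage bucket.
FEEDS_DATABASE_URL = os.environ["FEEDS_DATABASE_URL"]
ANALYTICS_BUCKET = os.environ["ANALYTICS_BUCKET"]
```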

## Local Development

Refer to the main [README.md](../README.md) for general setup instructions for the development environment.
20 changes: 20 additions & 0 deletions functions-python/preprocessed_analytics/function_config.json
@@ -0,0 +1,20 @@
{
  "name": "preprocess-analytics",
  "description": "Preprocess analytics",
  "entry_point": "preprocess_analytics",
  "timeout": 540,
  "memory": "2Gi",
  "trigger_http": false,
  "include_folders": ["database_gen", "helpers"],
  "environment_variables": [],
  "secret_environment_variables": [
    {
      "key": "FEEDS_DATABASE_URL"
    }
  ],
  "ingress_settings": "ALLOW_ALL",
  "max_instance_request_concurrency": 1,
  "max_instance_count": 5,
  "min_instance_count": 0,
  "available_cpu": 1
}
15 changes: 15 additions & 0 deletions functions-python/preprocessed_analytics/requirements.txt
@@ -0,0 +1,15 @@
functions-framework==3.*
google-cloud-logging
google-cloud-bigquery
google-cloud-storage
psycopg2-binary==2.9.6
aiohttp~=3.8.6
asyncio~=3.4.3
urllib3~=2.1.0
SQLAlchemy==2.0.23
geoalchemy2==0.14.7
requests~=2.31.0
attrs~=23.1.0
pluggy~=1.3.0
certifi~=2023.7.22
pandas
4 changes: 4 additions & 0 deletions functions-python/preprocessed_analytics/requirements_dev.txt
@@ -0,0 +1,4 @@
Faker
pytest~=7.4.3
urllib3-mock
requests-mock
Empty file.