Skip to content

Commit

Permalink
feat: Add our own DNS to GCP bucket links (#273)
Browse files Browse the repository at this point in the history
  • Loading branch information
davidgamez authored Feb 13, 2024
1 parent a881dc1 commit 34288de
Show file tree
Hide file tree
Showing 4 changed files with 90 additions and 6 deletions.
16 changes: 11 additions & 5 deletions functions-python/batch_process_dataset/src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ def __init__(
bucket_name,
authentication_type,
api_key_parameter_name,
public_hosted_datasets_url,
):
self.producer_url = producer_url
self.bucket_name = bucket_name
Expand All @@ -72,6 +73,7 @@ def __init__(
self.date = datetime.now().strftime("%Y%m%d%H%S")
feeds_credentials = ast.literal_eval(os.getenv("FEED_CREDENTIALS", "{}"))
self.feed_credentials = feeds_credentials.get(self.feed_stable_id, None)
self.public_hosted_datasets_url = public_hosted_datasets_url

self.init_status = None
self.init_status_additional_data = None
Expand Down Expand Up @@ -137,20 +139,22 @@ def upload_dataset(self) -> DatasetFile or None:
dataset_stable_id = self.create_dataset_stable_id(
self.feed_stable_id, self.date
)

dataset_full_path = (
f"{self.feed_stable_id}/{dataset_stable_id}/{dataset_stable_id}.zip"
)
logging.info(
f"Creating file: {self.feed_stable_id}/{dataset_stable_id}/{dataset_stable_id}.zip"
f"Creating file: {dataset_full_path}"
f" in bucket {self.bucket_name}"
)
timestamp_blob = self.upload_file_to_storage(
self.upload_file_to_storage(
temp_file_path,
f"{self.feed_stable_id}/{dataset_stable_id}/{dataset_stable_id}.zip",
f"{dataset_full_path}",
)

return DatasetFile(
stable_id=dataset_stable_id,
file_sha256_hash=file_sha256_hash,
hosted_url=timestamp_blob.public_url,
hosted_url=f"{self.public_hosted_datasets_url}/{dataset_full_path}",
)

logging.info(
Expand Down Expand Up @@ -282,6 +286,7 @@ def process_dataset(cloud_event: CloudEvent):
bucket_name = os.getenv("DATASETS_BUCKET_NANE")
start_db_session(os.getenv("FEEDS_DATABASE_URL"))
maximum_executions = os.getenv("MAXIMUM_EXECUTIONS", 1)
public_hosted_datasets_url = os.getenv("PUBLIC_HOSTED_DATASETS_URL")
trace_service = None
dataset_file: DatasetFile = None
error_message = None
Expand Down Expand Up @@ -319,6 +324,7 @@ def process_dataset(cloud_event: CloudEvent):
bucket_name,
int(json_payload["authentication_type"]),
json_payload["api_key_parameter_name"],
public_hosted_datasets_url,
)
dataset_file = processor.process()
except Exception as e:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
)
file_content: Final[bytes] = b"Test content"
file_hash: Final[str] = sha256(file_content).hexdigest()
test_hosted_public_url = "https://the-no-existent-url.com"


def create_cloud_event(mock_data):
Expand Down Expand Up @@ -49,6 +50,7 @@ def test_upload_dataset_diff_hash(
"""
mock_blob = MagicMock()
mock_blob.public_url = public_url
mock_blob.path = public_url
upload_file_to_storage.return_value = mock_blob
mock_download_url_content.return_value = file_hash

Expand All @@ -61,14 +63,19 @@ def test_upload_dataset_diff_hash(
"bucket_name",
0,
None,
test_hosted_public_url,
)
with patch.object(processor, "date", "mocked_timestamp"):
result = processor.upload_dataset()

self.assertIsNotNone(result)
mock_download_url_content.assert_called_once()
self.assertIsInstance(result, DatasetFile)
self.assertEqual(result.hosted_url, public_url)
self.assertEqual(
result.hosted_url,
f"{test_hosted_public_url}/feed_stable_id/feed_stable_id-mocked_timestamp"
f"/feed_stable_id-mocked_timestamp.zip",
)
self.assertEqual(result.file_sha256_hash, file_hash)
# Upload to storage is called twice, one for the latest and one for the timestamped one
self.assertEqual(upload_file_to_storage.call_count, 2)
Expand All @@ -95,6 +102,7 @@ def test_upload_dataset_same_hash(
"bucket_name",
0,
None,
test_hosted_public_url,
)

result = processor.upload_dataset()
Expand Down Expand Up @@ -126,6 +134,7 @@ def test_upload_dataset_download_exception(
"bucket_name",
0,
None,
test_hosted_public_url,
)

with self.assertRaises(Exception):
Expand Down Expand Up @@ -158,6 +167,7 @@ def test_upload_file_to_storage(self):
bucket_name,
0,
None,
test_hosted_public_url,
)
result = processor.upload_file_to_storage(source_file_path, target_path)

Expand Down Expand Up @@ -192,6 +202,7 @@ def test_process(self):
bucket_name,
authentication_type,
api_key_parameter_name,
test_hosted_public_url,
)

processor.upload_dataset = MagicMock(
Expand Down Expand Up @@ -228,6 +239,7 @@ def test_process_no_change(self):
bucket_name,
authentication_type,
api_key_parameter_name,
test_hosted_public_url,
)

processor.upload_dataset = MagicMock(return_value=None)
Expand Down
60 changes: 60 additions & 0 deletions infra/batch/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ locals {
# DEV and QA use the vpc connector
vpc_connector_name = lower(var.environment) == "dev" ? "vpc-connector-qa" : "vpc-connector-${lower(var.environment)}"
vpc_connector_project = lower(var.environment) == "dev" ? "mobility-feeds-qa" : var.project_id
# Files DNS name
public_hosted_datasets_url = lower(var.environment) == "prod" ? "https://${var.public_hosted_datasets_dns}" : "https://${var.environment}-${var.public_hosted_datasets_dns}"
}

data "google_vpc_access_connector" "vpc_connector" {
Expand All @@ -39,6 +41,12 @@ data "google_vpc_access_connector" "vpc_connector" {
project = local.vpc_connector_project
}

# This resource maps an already created SSL certificate to a terraform state resource.
# The SSL setup is done outside terraform for security reasons.
# NOTE(review): no "project" argument is set, so the lookup uses the provider's
# default project — confirm this is intended for multi-project environments.
data "google_compute_ssl_certificate" "files_ssl_cert" {
  name = "files-${var.environment}-mobilitydatabase"
}

resource "google_project_service" "services" {
for_each = toset(local.services)
service = each.value
Expand Down Expand Up @@ -261,6 +269,7 @@ resource "google_cloudfunctions2_function" "pubsub_function" {
PYTHONNODEBUGRANGES = 0
DB_REUSE_SESSION = "True"
ENVIRONMENT = var.environment
PUBLIC_HOSTED_DATASETS_URL = local.public_hosted_datasets_url
}
dynamic "secret_environment_variables" {
for_each = local.function_batch_process_dataset_config.secret_environment_variables
Expand Down Expand Up @@ -321,3 +330,54 @@ resource "google_cloud_scheduler_job" "job" {
}
}
}

# Backend that serves objects from the datasets GCS bucket through the
# external HTTPS load balancer (fronted by the files_* resources below).
resource "google_compute_backend_bucket" "files_backend" {
  name = "datasets-backend-${var.environment}"
  bucket_name = google_storage_bucket.datasets_bucket.name
  # NOTE(review): CDN is disabled — presumably to avoid serving stale dataset
  # copies after re-uploads; confirm before enabling caching.
  enable_cdn = false
}

# URL map for the datasets HTTPS load balancer: every host and every path is
# routed to the datasets backend bucket.
resource "google_compute_url_map" "files_url_map" {
  name            = "files-url-map-${var.environment}"
  default_service = google_compute_backend_bucket.files_backend.id

  # NOTE(review): this host_rule/path_matcher pair is redundant — all traffic
  # already falls through to default_service — but it is kept so existing
  # terraform state is not churned and future per-path routing stays explicit.
  host_rule {
    hosts        = ["*"]
    path_matcher = "allpaths"
  }

  path_matcher {
    name            = "allpaths"
    default_service = google_compute_backend_bucket.files_backend.id
  }
}

# Terminates TLS for the public datasets endpoint using the externally managed
# certificate (see data.google_compute_ssl_certificate.files_ssl_cert) and
# hands requests to the URL map.
resource "google_compute_target_https_proxy" "files_https_proxy" {
  name = "files-proxy-${var.environment}"
  url_map = google_compute_url_map.files_url_map.id
  ssl_certificates = [data.google_compute_ssl_certificate.files_ssl_cert.id]
}

# Static global IPv4/IPv6 addresses for the load balancer. These are data
# sources, so the addresses are looked up — not managed — by this
# configuration; they must already exist in the project.
data "google_compute_global_address" "files_http_lb_ipv4" {
  name = "files-http-lb-ipv4-static-${var.environment}"
}

data "google_compute_global_address" "files_http_lb_ipv6" {
  name = "files-http-lb-ipv6-static-${var.environment}"
}

# Forwards HTTPS (port 443) traffic arriving on the static IPv6 address to the
# TLS proxy.
# NOTE(review): this resource's name carries no IPv6 marker while its IPv4
# counterpart is suffixed "-v4"; renaming now would force a destroy/recreate
# of the forwarding rule, so the inconsistency is left as-is.
resource "google_compute_global_forwarding_rule" "files_http_lb_rule" {
  name = "files-http-lb-rule-${var.environment}"
  target = google_compute_target_https_proxy.files_https_proxy.self_link
  port_range = "443"
  ip_address = data.google_compute_global_address.files_http_lb_ipv6.address
  load_balancing_scheme = "EXTERNAL_MANAGED"
}

# Forwards HTTPS (port 443) traffic arriving on the static IPv4 address to the
# same TLS proxy, so the endpoint is reachable over both IP families.
resource "google_compute_global_forwarding_rule" "files_http_lb_rule_ipv4" {
  name = "files-http-lb-rule-v4-${var.environment}"
  target = google_compute_target_https_proxy.files_https_proxy.self_link
  port_range = "443"
  ip_address = data.google_compute_global_address.files_http_lb_ipv4.address
  load_balancing_scheme = "EXTERNAL_MANAGED"
}
6 changes: 6 additions & 0 deletions infra/batch/vars.tf
Original file line number Diff line number Diff line change
Expand Up @@ -46,4 +46,10 @@ variable "python_runtime" {
type = string
description = "Python runtime version"
default = "python310"
}

# Base DNS name under which datasets are publicly served. Non-prod
# environments prefix it with the environment name (see the
# public_hosted_datasets_url local in infra/batch/main.tf).
variable "public_hosted_datasets_dns" {
  type = string
  description = "Public hosted DNS for datasets"
  default = "files.mobilitydatabase.com"
}

0 comments on commit 34288de

Please sign in to comment.