From f77609fe9d008fe391e1914dfca1ac225831591c Mon Sep 17 00:00:00 2001 From: "kai [they]" Date: Tue, 19 Nov 2024 10:57:37 -0800 Subject: [PATCH 1/4] [Issue #2475] Scale staging to the same numbers as prod (#2929) ## Context Relates to #2475 ## Motivation I would like to do the load test in staging, a) because it should be more stable than dev and b) because I don't want to mess up our prod analytics --- infra/api/app-config/staging.tf | 38 +++++++++++++++++----------- infra/frontend/app-config/staging.tf | 23 ++++++++++------- 2 files changed, 37 insertions(+), 24 deletions(-) diff --git a/infra/api/app-config/staging.tf b/infra/api/app-config/staging.tf index d670ee0cb..495056d3d 100644 --- a/infra/api/app-config/staging.tf +++ b/infra/api/app-config/staging.tf @@ -7,23 +7,31 @@ module "staging_config" { database_enable_http_endpoint = true has_incident_management_service = local.has_incident_management_service - # https://docs.aws.amazon.com/AmazonECS/latest/developerguide/service-auto-scaling.html - # https://us-east-1.console.aws.amazon.com/ecs/v2/clusters/api-staging/services/api-staging/health?region=us-east-1 - # instance_desired_instance_count and instance_scaling_min_capacity are scaled for the average CPU and Memory - # seen over 12 months, as of November 2024 exlucing an outlier range around February 2024. - # With a minimum of 2, so CPU doesn't spike to infinity on deploys. + # # https://docs.aws.amazon.com/AmazonECS/latest/developerguide/service-auto-scaling.html + # # https://us-east-1.console.aws.amazon.com/ecs/v2/clusters/api-staging/services/api-staging/health?region=us-east-1 + # # instance_desired_instance_count and instance_scaling_min_capacity are scaled for the average CPU and Memory + # # seen over 12 months, as of November 2024 exlucing an outlier range around February 2024. + # # With a minimum of 2, so CPU doesn't spike to infinity on deploys. 
+ # instance_desired_instance_count = 2 + # instance_scaling_min_capacity = 2 + # # instance_scaling_max_capacity is 5x the instance_scaling_min_capacity + # instance_scaling_max_capacity = 10 + + # # https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/aurora-serverless-v2.setting-capacity.html + # # https://us-east-1.console.aws.amazon.com/rds/home?region=us-east-1#database:id=api-dev;is-cluster=true;tab=monitoring + # # database_min_capacity is average api-staging ServerlessDatabaseCapacity seen over 12 months, as of November 2024 + # database_min_capacity = 2 + # # database_max_capacity is 5x the database_min_capacity + # database_max_capacity = 10 + # database_instance_count = 2 + + # Temporarily scale staging to the same numbers as prod for the load test instance_desired_instance_count = 2 instance_scaling_min_capacity = 2 - # instance_scaling_max_capacity is 5x the instance_scaling_min_capacity - instance_scaling_max_capacity = 10 - - # https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/aurora-serverless-v2.setting-capacity.html - # https://us-east-1.console.aws.amazon.com/rds/home?region=us-east-1#database:id=api-dev;is-cluster=true;tab=monitoring - # database_min_capacity is average api-staging ServerlessDatabaseCapacity seen over 12 months, as of November 2024 - database_min_capacity = 2 - # database_max_capacity is 5x the database_min_capacity - database_max_capacity = 10 - database_instance_count = 2 + instance_scaling_max_capacity = 10 + database_min_capacity = 20 + database_max_capacity = 128 + database_instance_count = 2 has_search = true # https://docs.aws.amazon.com/opensearch-service/latest/developerguide/what-is.html#choosing-version diff --git a/infra/frontend/app-config/staging.tf b/infra/frontend/app-config/staging.tf index dd3a343e1..3aade1592 100644 --- a/infra/frontend/app-config/staging.tf +++ b/infra/frontend/app-config/staging.tf @@ -6,13 +6,18 @@ module "staging_config" { has_database = local.has_database has_incident_management_service = local.has_incident_management_service - # https://docs.aws.amazon.com/AmazonECS/latest/developerguide/service-auto-scaling.html - # https://us-east-1.console.aws.amazon.com/ecs/v2/clusters/frontend-dev/services/frontend-dev/health?region=us-east-1 - # instance_desired_instance_count and instance_scaling_min_capacity are scaled for the average CPU and Memory - # seen over 12 months, as of November 2024 exlucing an outlier range around February 2024. - # With a minimum of 2, so CPU doesn't spike to infinity on deploys. - instance_desired_instance_count = 2 - instance_scaling_min_capacity = 2 - # instance_scaling_max_capacity is 5x the instance_scaling_min_capacity - instance_scaling_max_capacity = 10 + # # https://docs.aws.amazon.com/AmazonECS/latest/developerguide/service-auto-scaling.html + # # https://us-east-1.console.aws.amazon.com/ecs/v2/clusters/frontend-dev/services/frontend-dev/health?region=us-east-1 + # # instance_desired_instance_count and instance_scaling_min_capacity are scaled for the average CPU and Memory + # # seen over 12 months, as of November 2024 exlucing an outlier range around February 2024. + # # With a minimum of 2, so CPU doesn't spike to infinity on deploys. 
+ # instance_desired_instance_count = 2
+ # instance_scaling_min_capacity = 2
+ # # instance_scaling_max_capacity is 5x the instance_scaling_min_capacity
+ # instance_scaling_max_capacity = 10
+
+ # Temporarily scale staging to the same numbers as prod for the load test
+ instance_desired_instance_count = 4
+ instance_scaling_min_capacity = 4
+ instance_scaling_max_capacity = 20
 }

From 7ea147130507b60f0b5de1bd619891a2cc440ed8 Mon Sep 17 00:00:00 2001
From: Aaron Couch
Date: Tue, 19 Nov 2024 14:30:10 -0500
Subject: [PATCH 2/4] [Issue #2474] Update artillery data (#2928)

## Summary

Fixes #2474

### Time to review: __5 mins__

## Changes proposed

Adds data for artillery. Opp ids for dev/stage and prod were grabbed from CSV exports. The prod list was winnowed down by randomly selecting 20,000 opp ids. Agency data was gathered by querying the local, dev, and prod dbs.

---
 DEVELOPMENT.md | 5 +-
 frontend/tests/artillery/params.json | 168 --------------------------
 frontend/tests/artillery/processor.ts | 18 ++-
 3 files changed, 16 insertions(+), 175 deletions(-)
 delete mode 100644 frontend/tests/artillery/params.json

diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md
index d5e46970b..cf82402f5 100644
--- a/DEVELOPMENT.md
+++ b/DEVELOPMENT.md
@@ -115,8 +115,9 @@ Any changes to features, tools, or workflows should include updates or additions
 To run the load test:
 1. Install artillery locally if you haven't done so with `npm install -g artillery@latest`
-2. `$ cd api` or or `$ cd frontend`
-3. `$ make load-test-` where env is either `local`, `dev`, `staging`, or `production`
+2. For the frontend, download the required data from https://drive.google.com/file/d/1zknvVSRqL7xs8VGinztuKelfppYlgRoP and save "params.json" to `frontend/tests/artillery/params.json`
+3. `$ cd api` or `$ cd frontend`
+4.
`$ make load-test-` where env is either `local`, `dev`, `staging`, or `production` - `make load-test-local` - requires running a local container in another console diff --git a/frontend/tests/artillery/params.json b/frontend/tests/artillery/params.json deleted file mode 100644 index ecb30e494..000000000 --- a/frontend/tests/artillery/params.json +++ /dev/null @@ -1,168 +0,0 @@ -{ - "ids": { - "local": [ - 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, - 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35 - ], - "dev": [], - "prod": [] - }, - "queries": [ - "test", - "grants", - "education", - "transportation", - "trauma", - "veterans" - ], - "status": ["posted", "forecasted", "closed", "archived", "none"], - "agencies": [ - "ARPAH", - "USAID", - "USAID-AFG", - "USAID", - "USAID-ARM", - "USAID-AZE", - "USAID-BAN", - "USAID-BEN", - "AC", - "DC", - "USDA", - "USDA-AMS", - "USDA-FNS1", - "DOC", - "DOC-DOCNOAAERA", - "DOC-EDA", - "DOC-NIST", - "DOD", - "DOD-AMC-ACCAPGN", - "DOD-AMC-ACCAPGD", - "DOD-AFRL-AFRLDET8", - "DOD-AFRL", - "DOD-USAFA", - "DOD-AFOSR", - "DOD-DARPA-BTO", - "ED", - "DOE", - "DOE-ARPAE", - "DOE-GFO", - "DOE-01", - "PAMS", - "PAMS-SC", - "HHS", - "HHS-ACF-FYSB", - "HHS-ACF", - "HHS-ACF-CB", - "DHS", - "DHS-DHS", - "DHS-OPO", - "DHS-USCG", - "HUD", - "USDOJ", - "USDOJ-OJP-BJA", - "USDOJ-OJP-COPS", - "DOL", - "DOL-ETA-ILAB", - "DOL-ETA-CEO", - "DOS", - "DOS-NEA-AC", - "DOS-DRL", - "DOS-ECA", - "DOI", - "DOI-BIA", - "DOI-BLM", - "DOI-BOR", - "USDOT", - "USDOT-ORP", - "USDOT-DO-SIPPRA", - "USDOT-GCR", - "DOT", - "DOT-DOT X-50", - "DOT-RITA", - "DOT-FAA-FAA ARG", - "DOT-FRA", - "DOT-FHWA", - "DOT-FTA", - "DOT-FAA-FAA COE-AJFE", - "DOT-FAA-FAA COE-FAA JAMS", - "DOT-FAA-FAA COE-TTHP", - "DOT-MA", - "DOT-NHTSA", - "VA", - "VA-CSHF", - "VA-HPGPDP", - "VA-LSV", - "VA-NVSP", - "VA-NCAC", - "VA-OMHSP", - "VA-SSVF", - "VA-NCA", - "VA-VLGP", - "EPA", - "IMLS", - "MCC", - "NASA", - "NASA-HQ", - "NASA-JSC", - "NASA-SFC", - "NASA", - "NARA", - "NEA", - "NEH", - "NSF", - "SSA" - ], - "eligibility": [ - "state_governments", - "county_governments", - "city_or_township_governments", - "special_district_governments", - "independent_school_districts", - "public_and_state_institutions_of_higher_education", - "private_institutions_of_higher_education", - "federally_recognized_native_american_tribal_governments", - "other_native_american_tribal_organizations", - "public_and_indian_housing_authorities", - "nonprofits_non_higher_education_with_501c3", - "nonprofits_non_higher_education_without_501c3", - "for_profit_organizations_other_than_small_businesses", - "small_businesses", - "other", - "unrestricted" - ], - "funding": [ - "cooperative_agreement", - "grant", - "procurement_contract", - "other" - ], - "category": [ - "recovery_act", - "agriculture", - "arts", - "business_and_commerce", - "community_development", - "consumer_protection", - "disaster_prevention_and_relief", - "education", - "employment_labor_and_training", - "energy", - "environment", - "food_and_nutrition", - "health", - "housing", - "humanities", - "information_and_statistics", - "infrastructure_investment_and_jobs_act", - "income_security_and_social_services", - "law_justice_and_legal_services", - "natural_resources", - "opportunity_zone_benefits", - "regional_development", - "science_technology_and_other_research_and_development", - "transportation", - "affordable_care_act", - "other" - ], - "pages": ["", "process", "research", "subscribe", "subscribe/confirmation"] -} diff --git 
a/frontend/tests/artillery/processor.ts b/frontend/tests/artillery/processor.ts index a6a2681c2..3ff159246 100644 --- a/frontend/tests/artillery/processor.ts +++ b/frontend/tests/artillery/processor.ts @@ -8,13 +8,16 @@ type dataType = { queries: Array; pages: Array; status: Array; - agencies: Array; + agencies: { + [key: string]: Array; + }; funding: Array; eligibility: Array; category: Array; }; type globalVars = { $environment?: string; + env: string; }; type returnVars = { @@ -26,7 +29,7 @@ type returnVars = { // eslint-disable-next-line @typescript-eslint/require-await async function getOppId(context: { vars: dataType & returnVars & globalVars }) { - const env = context.vars.$environment as string; + const env = context.vars.env; context.vars.id = context.vars.ids[env][random(context.vars.ids[env].length - 1)]; } @@ -49,11 +52,14 @@ async function getStatic(context: { vars: returnVars }) { } // eslint-disable-next-line @typescript-eslint/require-await -async function getSearchQuery(context: { vars: returnVars & dataType }) { +async function getSearchQuery(context: { + vars: returnVars & dataType & globalVars; +}) { + const env = context.vars.env; const { queries, status, agencies, eligibility, category } = context.vars; const queryParam = `query=${queries[random(queries.length - 1)]}`; const statusParam = `status=${status[random(status.length - 1)]}`; - const agencyParam = `agency=${agencies[random(agencies.length - 1)]}`; + const agencyParam = `agency=${agencies[env][random(agencies[env].length - 1)]}`; const categoryParam = `category=${category[random(category.length - 1)]}`; const eligibilityParam = `eligibility=${eligibility[random(eligibility.length - 1)]}`; const pagerParam = `page=${random(5)}`; @@ -100,13 +106,15 @@ async function loadData(context: { vars: dataType & globalVars }) { // Dev and stage have the same data. const env = context.vars.$environment === "stage" ? "dev" : context.vars.$environment; + const envs = new Set(["local", "dev", "stage", "prod"]); if (!env || !envs.has(env)) { throw new Error(`env ${env ?? ""} does not exist in env list`); } const path = "./tests/artillery/params.json"; const file = await readFile(path, "utf8"); - context.vars = JSON.parse(file) as dataType; + context.vars = JSON.parse(file) as dataType & globalVars; + context.vars.env = env; } function randomString(length: number) { From c8e2006c9e6ce917a40c49ef3efc2bf8ed0d38ee Mon Sep 17 00:00:00 2001 From: David Dudas Date: Tue, 19 Nov 2024 14:21:30 -0800 Subject: [PATCH 3/4] [Issue 2869] Add versioning to analytics db schema (#2870) ## Summary Fixes #2869 ### Time to review: __5 mins__ ## Changes proposed > What was added, updated, or removed in this PR. 1. Added a simple versioning pattern to enable schema extensibility, including a table in the schema to keep track of current schema version 2. Refactored `make init-db` command to enable execution of multiple SQL files, and update schema version accordingly upon the execution of each file ## Context for reviewers > Testing instructions, background context, more in-depth details of the implementation, and anything else you'd like to call out or ask reviewers. Explain how the changes were verified. This is part of an effort to enable automated deployment of analytics db to staging and prod. ## Additional information > Screenshots, GIF demos, code examples or output to help show the changes working as expected. 
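For reference, the core of the versioning pattern is small. Below is a condensed sketch — not the code in this PR (the real logic lives in `etldb.py` and `main.py` in the diff that follows), and the helper name and arguments are illustrative only — of how the runner ties the numbered `NNNN_description.sql` files to the single-row `schema_version` table:

```python
# Condensed illustration of the migration/versioning pattern; assumes
# SQLAlchemy 2.x, a directory of NNNN_description.sql files, and the
# single-row schema_version table created by migration 0002.
import re
from pathlib import Path

from sqlalchemy import Connection, text


def apply_pending_migrations(
    conn: Connection, migrations_dir: Path, current_version: int
) -> int:
    """Apply every migration newer than current_version, in filename order."""
    for sql_file in sorted(migrations_dir.glob("*.sql")):
        # expected filename format: {4_digit_version_number}_{short_description}.sql
        if not re.match(r"^\d{4}_.+\.sql$", sql_file.name):
            continue  # the real code treats a malformed name as a fatal error
        version = int(sql_file.name[:4])
        if version <= current_version:
            continue  # already applied
        conn.execute(text(sql_file.read_text()))  # run the migration script
        conn.execute(  # bump the single-row version marker
            text(
                "insert into schema_version (version) values (:v) "
                "on conflict(one_row) do update set version = :v"
            ),
            {"v": version},
        )
        conn.commit()
        current_version = version
    return current_version
```

On a fresh database this walks every script in order; on an existing one it only runs what is newer than the recorded version, which is what makes re-running `make init-db` against an already-initialized database safe.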
--- .github/workflows/ci-analytics.yml | 6 +- analytics/src/analytics/cli.py | 4 +- .../analytics/integrations/etldb/__init__.py | 8 +- .../src/analytics/integrations/etldb/etldb.py | 84 ++++++++++- .../src/analytics/integrations/etldb/main.py | 139 ++++++++++++------ .../versions/0001_create_tables_etldb.sql} | 0 .../versions/0002_create_schema_versions.sql | 7 + ...003_alter_tables_set_default_timestamp.sql | 9 ++ analytics/tests/integrations/test_etldb.py | 93 ++++++++++++ 9 files changed, 299 insertions(+), 51 deletions(-) rename analytics/src/analytics/integrations/etldb/{create_etl_db.sql => migrations/versions/0001_create_tables_etldb.sql} (100%) create mode 100644 analytics/src/analytics/integrations/etldb/migrations/versions/0002_create_schema_versions.sql create mode 100644 analytics/src/analytics/integrations/etldb/migrations/versions/0003_alter_tables_set_default_timestamp.sql create mode 100644 analytics/tests/integrations/test_etldb.py diff --git a/.github/workflows/ci-analytics.yml b/.github/workflows/ci-analytics.yml index 6820e657c..6256cc074 100644 --- a/.github/workflows/ci-analytics.yml +++ b/.github/workflows/ci-analytics.yml @@ -39,12 +39,12 @@ jobs: - name: Run linting run: make lint + - name: Run database initialization + run: docker compose down --volumes && make init-db + - name: Run tests run: make test-audit - - name: Run database initialization - run: make init-db - # Both of these tasks are looking for github and slack auth # - name: Export GitHub data # run: make gh-data-export diff --git a/analytics/src/analytics/cli.py b/analytics/src/analytics/cli.py index 491b585b2..44ba05c5f 100644 --- a/analytics/src/analytics/cli.py +++ b/analytics/src/analytics/cli.py @@ -269,7 +269,7 @@ def export_json_to_database(delivery_file: Annotated[str, ISSUE_FILE_ARG]) -> No def initialize_database() -> None: """Initialize etl database.""" logger.info("initializing database") - etldb.init_db() + etldb.initialize_database() logger.info("done") @@ -296,7 +296,7 @@ def transform_and_load( dataset = EtlDataset.load_from_json_file(file_path=issue_file) # sync data to db - etldb.sync_db(dataset, datestamp) + etldb.sync_data(dataset, datestamp) # finish print("transform and load is done") diff --git a/analytics/src/analytics/integrations/etldb/__init__.py b/analytics/src/analytics/integrations/etldb/__init__.py index c1afd0946..3608faff7 100644 --- a/analytics/src/analytics/integrations/etldb/__init__.py +++ b/analytics/src/analytics/integrations/etldb/__init__.py @@ -1,11 +1,11 @@ """Read and write data from/to delivery metrics database.""" __all__ = [ - "init_db", - "sync_db", + "initialize_database", + "sync_data", ] from analytics.integrations.etldb.main import ( - init_db, - sync_db, + initialize_database, + sync_data, ) diff --git a/analytics/src/analytics/integrations/etldb/etldb.py b/analytics/src/analytics/integrations/etldb/etldb.py index 5d9ca2671..63ffa18ed 100644 --- a/analytics/src/analytics/integrations/etldb/etldb.py +++ b/analytics/src/analytics/integrations/etldb/etldb.py @@ -2,7 +2,7 @@ from enum import Enum -from sqlalchemy import Connection +from sqlalchemy import Connection, text from analytics.integrations.db import PostgresDbClient @@ -36,6 +36,88 @@ def commit(self, connection: Connection) -> None: """Commit an open transaction.""" connection.commit() + def get_schema_version(self) -> int: + """Select schema version from etl database.""" + version = 0 + + if self.schema_versioning_exists(): + result = self.connection().execute( + text("select version from 
schema_version"), + ) + row = result.fetchone() + if row: + version = row[0] + + return version + + def set_schema_version(self, new_value: int) -> bool: + """Set schema version number.""" + if not self.schema_versioning_exists(): + return False + + # sanity check new version number + current_version = self.get_schema_version() + if new_value < current_version: + message = ( + "WARNING: cannot bump schema version " + f"from {current_version} to {new_value}" + ) + print(message) + return False + + if new_value > current_version: + cursor = self.connection() + cursor.execute( + text( + "insert into schema_version (version) values (:new_value) " + "on conflict(one_row) do update " + "set version = :new_value", + ), + {"new_value": new_value}, + ) + self.commit(cursor) + return True + + return False + + def revert_to_schema_version(self, new_value: int) -> bool: + """Revert schema version number to the previous version.""" + if not self.schema_versioning_exists(): + return False + + # sanity check new version number + current_version = self.get_schema_version() + if new_value != current_version - 1 or new_value < 0: + message = ( + "WARNING: cannot bump schema version " + f"from {current_version} to {new_value}" + ) + print(message) + return False + + cursor = self.connection() + cursor.execute( + text( + "insert into schema_version (version) values (:new_value) " + "on conflict(one_row) do update " + "set version = :new_value", + ), + {"new_value": new_value}, + ) + self.commit(cursor) + return True + + def schema_versioning_exists(self) -> bool: + """Determine whether schema version table exists.""" + result = self.connection().execute( + text( + "select table_name from information_schema.tables " + "where table_name = 'schema_version'", + ), + ) + row = result.fetchone() + return bool(row and row[0] == "schema_version") + class EtlChangeType(Enum): """An enum to describe ETL change types.""" diff --git a/analytics/src/analytics/integrations/etldb/main.py b/analytics/src/analytics/integrations/etldb/main.py index 3f300042d..81cc549b4 100644 --- a/analytics/src/analytics/integrations/etldb/main.py +++ b/analytics/src/analytics/integrations/etldb/main.py @@ -1,5 +1,7 @@ """Integrate with database to read and write etl data.""" +import os +import re from pathlib import Path from psycopg.errors import InsufficientPrivilege @@ -17,35 +19,55 @@ VERBOSE = False -def init_db() -> None: - """Initialize etl database.""" - # define the path to the sql file - parent_path = Path(__file__).resolve().parent - sql_path = f"{parent_path}/create_etl_db.sql" - - # read sql file - with open(sql_path) as f: - sql = f.read() +def initialize_database() -> None: + """ + Create and/or update an etl database by applying a sequential set of migration scripts. 
+ + It applies the migrations using the following steps: + - Check the current schema version listed in the database + - Retrieve the list of migration scripts ordered by version + - If the current schema version is less than the version of the latest migration script + run the remaining migrations in order + - Bump the schema version in the database to the latest version + - If the current schema version matches the latest script, do nothing + """ + # get connection to database + etldb = EtlDb() + current_version = etldb.get_schema_version() + print(f"current schema version: {current_version}") + + # get all sql file paths and respective version numbers + sql_file_path_map = get_sql_file_paths() + all_versions = sorted(sql_file_path_map.keys()) + + # iterate sql files + migration_count = 0 + for next_version in all_versions: + if next_version <= current_version: + continue + # read sql file + with open(sql_file_path_map[next_version]) as f: + sql = f.read() + # execute sql + print(f"applying migration for schema version: {next_version}") + print(f"migration source file: {sql_file_path_map[next_version]}") + cursor = etldb.connection() + cursor.execute( + text(sql), + ) + # commit changes + etldb.commit(cursor) + # bump schema version number + _ = etldb.set_schema_version(next_version) + current_version = next_version + migration_count += 1 - # execute sql - try: - db = EtlDb() - cursor = db.connection() - cursor.execute( - text(sql), - ) - db.commit(cursor) - except ( - RuntimeError, - ProgrammingError, - OperationalError, - InsufficientPrivilege, - ) as e: - message = f"FATAL: Failed to initialize db: {e}" - raise RuntimeError(message) from e + # summarize results in output + print(f"total migrations applied: {migration_count}") + print(f"new schema version: {current_version}") -def sync_db(dataset: EtlDataset, effective: str) -> None: +def sync_data(dataset: EtlDataset, effective: str) -> None: """Write github data to etl database.""" # initialize a map of github id to db row id ghid_map: dict[EtlEntityType, dict[str, int]] = { @@ -63,10 +85,10 @@ def sync_db(dataset: EtlDataset, effective: str) -> None: ghid_map[EtlEntityType.QUAD] = sync_quads(db, dataset) print(f"quad row(s) processed: {len(ghid_map[EtlEntityType.QUAD])}") except ( - RuntimeError, - ProgrammingError, - OperationalError, InsufficientPrivilege, + OperationalError, + ProgrammingError, + RuntimeError, ) as e: message = f"FATAL: Failed to sync quad data: {e}" raise RuntimeError(message) from e @@ -82,10 +104,10 @@ def sync_db(dataset: EtlDataset, effective: str) -> None: f"deliverable row(s) processed: {len(ghid_map[EtlEntityType.DELIVERABLE])}", ) except ( - RuntimeError, - ProgrammingError, - OperationalError, InsufficientPrivilege, + OperationalError, + ProgrammingError, + RuntimeError, ) as e: message = f"FATAL: Failed to sync deliverable data: {e}" raise RuntimeError(message) from e @@ -95,10 +117,10 @@ def sync_db(dataset: EtlDataset, effective: str) -> None: ghid_map[EtlEntityType.SPRINT] = sync_sprints(db, dataset, ghid_map) print(f"sprint row(s) processed: {len(ghid_map[EtlEntityType.SPRINT])}") except ( - RuntimeError, - ProgrammingError, - OperationalError, InsufficientPrivilege, + OperationalError, + ProgrammingError, + RuntimeError, ) as e: message = f"FATAL: Failed to sync sprint data: {e}" raise RuntimeError(message) from e @@ -108,10 +130,10 @@ def sync_db(dataset: EtlDataset, effective: str) -> None: ghid_map[EtlEntityType.EPIC] = sync_epics(db, dataset, ghid_map) print(f"epic row(s) processed: 
{len(ghid_map[EtlEntityType.EPIC])}") except ( - RuntimeError, - ProgrammingError, - OperationalError, InsufficientPrivilege, + OperationalError, + ProgrammingError, + RuntimeError, ) as e: message = f"FATAL: Failed to sync epic data: {e}" raise RuntimeError(message) from e @@ -121,10 +143,10 @@ def sync_db(dataset: EtlDataset, effective: str) -> None: issue_map = sync_issues(db, dataset, ghid_map) print(f"issue row(s) processed: {len(issue_map)}") except ( - RuntimeError, - ProgrammingError, - OperationalError, InsufficientPrivilege, + OperationalError, + ProgrammingError, + RuntimeError, ) as e: message = f"FATAL: Failed to sync issue data: {e}" raise RuntimeError(message) from e @@ -190,3 +212,38 @@ def sync_quads(db: EtlDb, dataset: EtlDataset) -> dict: f"QUAD '{ghid}' title = '{quad_df['quad_name']}', row_id = {result[ghid]}", ) return result + + +def get_sql_file_paths() -> dict[int, str]: + """Get all sql files needed for database initialization.""" + result = {} + + # define the path to the sql files + sql_file_directory = f"{Path(__file__).resolve().parent}/migrations/versions" + + # get list of sorted filenames + filename_list = sorted(os.listdir(sql_file_directory)) + + # expected filename format: {4_digit_version_number}_{short_description}.sql + # example: 0003_alter_tables_set_default_timestamp.sql + pattern = re.compile(r"^\d\d\d\d_.+\.sql$") + + # compile dict of results + for filename in filename_list: + # validate filename format + if not pattern.match(filename): + message = f"FATAL: malformed db migration filename: {filename}" + raise RuntimeError(message) + + # extrace version number from filename + version = int(filename[:4]) + + # do not allow duplicate version number + if version in result: + message = f"FATAL: Duplicate db migration version number: {version} " + raise RuntimeError(message) + + # map the version number to the file path + result[version] = f"{sql_file_directory}/{filename}" + + return result diff --git a/analytics/src/analytics/integrations/etldb/create_etl_db.sql b/analytics/src/analytics/integrations/etldb/migrations/versions/0001_create_tables_etldb.sql similarity index 100% rename from analytics/src/analytics/integrations/etldb/create_etl_db.sql rename to analytics/src/analytics/integrations/etldb/migrations/versions/0001_create_tables_etldb.sql diff --git a/analytics/src/analytics/integrations/etldb/migrations/versions/0002_create_schema_versions.sql b/analytics/src/analytics/integrations/etldb/migrations/versions/0002_create_schema_versions.sql new file mode 100644 index 000000000..31d1d0eaa --- /dev/null +++ b/analytics/src/analytics/integrations/etldb/migrations/versions/0002_create_schema_versions.sql @@ -0,0 +1,7 @@ +CREATE TABLE IF NOT EXISTS schema_version ( + one_row BOOL GENERATED ALWAYS AS (TRUE) STORED, + version INTEGER NOT NULL, + t_created TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + t_modified TIMESTAMP DEFAULT CURRENT_TIMESTAMP +); +CREATE UNIQUE INDEX IF NOT EXISTS sv_i1 ON schema_version(one_row); diff --git a/analytics/src/analytics/integrations/etldb/migrations/versions/0003_alter_tables_set_default_timestamp.sql b/analytics/src/analytics/integrations/etldb/migrations/versions/0003_alter_tables_set_default_timestamp.sql new file mode 100644 index 000000000..d1d325145 --- /dev/null +++ b/analytics/src/analytics/integrations/etldb/migrations/versions/0003_alter_tables_set_default_timestamp.sql @@ -0,0 +1,9 @@ +ALTER TABLE gh_deliverable ALTER COLUMN t_modified SET DEFAULT CURRENT_TIMESTAMP; +ALTER TABLE gh_deliverable_quad_map ALTER 
COLUMN t_modified SET DEFAULT CURRENT_TIMESTAMP; +ALTER TABLE gh_epic ALTER COLUMN t_modified SET DEFAULT CURRENT_TIMESTAMP; +ALTER TABLE gh_epic_deliverable_map ALTER COLUMN t_modified SET DEFAULT CURRENT_TIMESTAMP; +ALTER TABLE gh_issue ALTER COLUMN t_modified SET DEFAULT CURRENT_TIMESTAMP; +ALTER TABLE gh_issue_history ALTER COLUMN t_modified SET DEFAULT CURRENT_TIMESTAMP; +ALTER TABLE gh_issue_sprint_map ALTER COLUMN t_modified SET DEFAULT CURRENT_TIMESTAMP; +ALTER TABLE gh_sprint ALTER COLUMN t_modified SET DEFAULT CURRENT_TIMESTAMP; +ALTER TABLE gh_quad ALTER COLUMN t_modified SET DEFAULT CURRENT_TIMESTAMP; diff --git a/analytics/tests/integrations/test_etldb.py b/analytics/tests/integrations/test_etldb.py new file mode 100644 index 000000000..c67d49fc7 --- /dev/null +++ b/analytics/tests/integrations/test_etldb.py @@ -0,0 +1,93 @@ +"""Tests the code in integrations/etldb.py.""" + +import sqlalchemy +from analytics.integrations.etldb.etldb import EtlDb + + +class TestEtlDb: + """Test EtlDb methods.""" + + TEST_FILE_1 = "./tests/etldb_test_01.json" + + def test_instantiate_with_effective_date(self): + """Class method should return the correctly instantiated object.""" + effective_date = "2024-11-18" + etldb = EtlDb(effective_date) + + assert etldb.effective_date == effective_date + + def test_database_connection(self): + """Class method should return database connection object.""" + etldb = EtlDb() + connection = etldb.connection() + + assert isinstance(connection, sqlalchemy.Connection) + + def test_schema_versioning(self): + """Class methods should return appropriate values.""" + etldb = EtlDb() + has_versioning = etldb.schema_versioning_exists() + current_version = etldb.get_schema_version() + + if has_versioning: + assert current_version >= 2 + else: + assert current_version <= 1 + + def test_set_version_number(self): + """Class method should successfully update version.""" + etldb = EtlDb() + if not etldb.schema_versioning_exists(): + return + + original_version = etldb.get_schema_version() + next_version = original_version + 1 + result = etldb.set_schema_version(next_version) + + assert result is True + assert etldb.get_schema_version() == next_version + + # revert to keep testing env in same state + etldb.revert_to_schema_version(original_version) + + def test_set_bad_version_number(self): + """Class method should not update version.""" + etldb = EtlDb() + if not etldb.schema_versioning_exists(): + return + + current_version = etldb.get_schema_version() + previous_version = current_version - 1 + result = etldb.set_schema_version(previous_version) + + assert result is False + assert etldb.get_schema_version() == current_version + + def test_revert_to_version_number(self): + """Class method should successfully update version.""" + etldb = EtlDb() + if not etldb.schema_versioning_exists(): + return + + original_version = etldb.get_schema_version() + previous_version = original_version - 1 + result = etldb.revert_to_schema_version(previous_version) + + assert result is True + assert etldb.get_schema_version() == previous_version + + # revert the revert to keep testing env in same state + etldb.set_schema_version(original_version) + + def test_revert_to_bad_version_number(self): + """Class method should successfully update version.""" + etldb = EtlDb() + if not etldb.schema_versioning_exists(): + return + + original_version = etldb.get_schema_version() + previous_version = -99 + result = etldb.revert_to_schema_version(previous_version) + + assert result is False + assert 
etldb.get_schema_version() == original_version From 53bf5361b40252ba95b9275ec7a2673a94ab9f83 Mon Sep 17 00:00:00 2001 From: Mike H Date: Wed, 20 Nov 2024 11:32:40 -0500 Subject: [PATCH 4/4] [Issue #2792] Add schemas for extract metadata list API (#2900) ## Summary Fixes #2792 ### Time to review: 10 mins ## Changes proposed Add request/response schemas for ExtractMetadata listing API ## Context for reviewers Creates the input/output schemas needed by the API to support querying extracts and returning data. ## Additional information See unit tests --- api/openapi.generated.yml | 27 +++-- api/src/api/extracts_v1/extract_schema.py | 56 +++++++++ .../opportunities_v1/opportunity_schemas.py | 19 +-- api/src/api/schemas/response_schema.py | 13 +++ .../api/extracts_v1/test_extract_schema.py | 110 ++++++++++++++++++ 5 files changed, 198 insertions(+), 27 deletions(-) create mode 100644 api/src/api/extracts_v1/extract_schema.py create mode 100644 api/tests/src/api/extracts_v1/test_extract_schema.py diff --git a/api/openapi.generated.yml b/api/openapi.generated.yml index f11e5e235..768522e27 100644 --- a/api/openapi.generated.yml +++ b/api/openapi.generated.yml @@ -1163,8 +1163,19 @@ components: properties: download_path: type: string - description: The URL to download the attachment - example: https://... + description: The file's download path + file_size_bytes: + type: integer + description: The size of the file in bytes + example: 1024 + created_at: + type: string + format: date-time + readOnly: true + updated_at: + type: string + format: date-time + readOnly: true mime_type: type: string description: The MIME type of the attachment @@ -1177,10 +1188,6 @@ components: type: string description: A description of the attachment example: The full announcement NOFO - file_size_bytes: - type: integer - description: The size of the attachment in bytes - example: 10012 opportunity_attachment_type: description: The type of attachment example: !!python/object/apply:src.constants.lookup_constants.OpportunityAttachmentType @@ -1190,14 +1197,6 @@ components: - other type: - string - created_at: - type: string - format: date-time - readOnly: true - updated_at: - type: string - format: date-time - readOnly: true OpportunityWithAttachmentsV1: type: object properties: diff --git a/api/src/api/extracts_v1/extract_schema.py b/api/src/api/extracts_v1/extract_schema.py new file mode 100644 index 000000000..2bfc7678a --- /dev/null +++ b/api/src/api/extracts_v1/extract_schema.py @@ -0,0 +1,56 @@ +from src.api.schemas.extension import Schema, fields +from src.api.schemas.response_schema import AbstractResponseSchema, FileResponseSchema +from src.constants.lookup_constants import ExtractType +from src.pagination.pagination_schema import generate_pagination_schema + + +class ExtractMetadataFilterV1Schema(Schema): + extract_type = fields.Enum( + ExtractType, + allow_none=True, + metadata={ + "description": "The type of extract to filter by", + "example": "opportunities_csv", + }, + ) + start_date = fields.Date( + allow_none=True, + metadata={ + "description": "The start date for filtering extracts", + "example": "2023-10-01", + }, + ) + end_date = fields.Date( + allow_none=True, + metadata={ + "description": "The end date for filtering extracts", + "example": "2023-10-07", + }, + ) + + +class ExtractMetadataRequestSchema(AbstractResponseSchema): + filters = fields.Nested(ExtractMetadataFilterV1Schema()) + pagination = fields.Nested( + generate_pagination_schema( + "ExtractMetadataPaginationV1Schema", + ["created_at"], + ), 
+ required=True, + ) + + +class ExtractMetadataResponseSchema(FileResponseSchema): + extract_metadata_id = fields.Integer( + metadata={"description": "The ID of the extract metadata", "example": 1} + ) + extract_type = fields.String( + metadata={"description": "The type of extract", "example": "opportunity_data_extract"} + ) + + +class ExtractMetadataListResponseSchema(AbstractResponseSchema): + data = fields.List( + fields.Nested(ExtractMetadataResponseSchema), + metadata={"description": "A list of extract metadata records"}, + ) diff --git a/api/src/api/opportunities_v1/opportunity_schemas.py b/api/src/api/opportunities_v1/opportunity_schemas.py index cd969ab35..471bf2438 100644 --- a/api/src/api/opportunities_v1/opportunity_schemas.py +++ b/api/src/api/opportunities_v1/opportunity_schemas.py @@ -1,7 +1,11 @@ from enum import StrEnum from src.api.schemas.extension import Schema, fields, validators -from src.api.schemas.response_schema import AbstractResponseSchema, PaginationMixinSchema +from src.api.schemas.response_schema import ( + AbstractResponseSchema, + FileResponseSchema, + PaginationMixinSchema, +) from src.api.schemas.search_schema import ( BoolSearchSchemaBuilder, DateSearchSchemaBuilder, @@ -309,13 +313,7 @@ class OpportunityV1Schema(Schema): updated_at = fields.DateTime(dump_only=True) -class OpportunityAttachmentV1Schema(Schema): - download_path = fields.String( - metadata={ - "description": "The URL to download the attachment", - "example": "https://...", - } - ) +class OpportunityAttachmentV1Schema(FileResponseSchema): mime_type = fields.String( metadata={"description": "The MIME type of the attachment", "example": "application/pdf"} ) @@ -328,9 +326,6 @@ class OpportunityAttachmentV1Schema(Schema): "example": "The full announcement NOFO", } ) - file_size_bytes = fields.Integer( - metadata={"description": "The size of the attachment in bytes", "example": 10012} - ) opportunity_attachment_type = fields.Enum( OpportunityAttachmentType, metadata={ @@ -338,8 +333,6 @@ class OpportunityAttachmentV1Schema(Schema): "example": OpportunityAttachmentType.NOTICE_OF_FUNDING_OPPORTUNITY, }, ) - created_at = fields.DateTime(dump_only=True) - updated_at = fields.DateTime(dump_only=True) class OpportunityWithAttachmentsV1Schema(OpportunityV1Schema): diff --git a/api/src/api/schemas/response_schema.py b/api/src/api/schemas/response_schema.py index 3f267ec3b..4220d4465 100644 --- a/api/src/api/schemas/response_schema.py +++ b/api/src/api/schemas/response_schema.py @@ -58,3 +58,16 @@ class ErrorResponseSchema(Schema): "example": "550e8400-e29b-41d4-a716-446655440000", } ) + + +class FileResponseSchema(Schema): + download_path = fields.String( + metadata={ + "description": "The file's download path", + }, + ) + file_size_bytes = fields.Integer( + metadata={"description": "The size of the file in bytes", "example": 1024} + ) + created_at = fields.DateTime(dump_only=True) + updated_at = fields.DateTime(dump_only=True) diff --git a/api/tests/src/api/extracts_v1/test_extract_schema.py b/api/tests/src/api/extracts_v1/test_extract_schema.py new file mode 100644 index 000000000..568aa54d3 --- /dev/null +++ b/api/tests/src/api/extracts_v1/test_extract_schema.py @@ -0,0 +1,110 @@ +from datetime import date + +import pytest +from marshmallow import ValidationError + +from src.api.extracts_v1.extract_schema import ( + ExtractMetadataListResponseSchema, + ExtractMetadataRequestSchema, + ExtractMetadataResponseSchema, +) +from src.db.models.extract_models import ExtractMetadata + + +@pytest.fixture +def 
sample_extract_metadata(): + return ExtractMetadata( + extract_metadata_id=1, + extract_type="opportunities_csv", + file_name="test_extract.csv", + file_path="/test/path/test_extract.csv", + file_size_bytes=2048, + ) + + +def test_request_schema_validation(): + schema = ExtractMetadataRequestSchema() + + # Test valid data + valid_data = { + "filters": { + "extract_type": "opportunities_csv", + "start_date": "2023-10-01", + "end_date": "2023-10-07", + }, + "pagination": { + "order_by": "created_at", + "page_offset": 1, + "page_size": 25, + "sort_direction": "ascending", + }, + } + result = schema.load(valid_data) + assert result["filters"]["extract_type"] == "opportunities_csv" + assert result["filters"]["start_date"] == date(2023, 10, 1) + assert result["filters"]["end_date"] == date(2023, 10, 7) + + # Test invalid extract_type + invalid_data = {"extract_type": "invalid_type", "start_date": "2023-10-01"} + with pytest.raises(ValidationError): + schema.load(invalid_data) + + +def test_response_schema_single(sample_extract_metadata): + schema = ExtractMetadataResponseSchema() + + sample_extract_metadata.download_path = "http://www.example.com" + extract_metadata = schema.dump(sample_extract_metadata) + + assert extract_metadata["download_path"] == "http://www.example.com" + + assert extract_metadata["extract_metadata_id"] == 1 + assert extract_metadata["extract_type"] == "opportunities_csv" + assert extract_metadata["download_path"] == "http://www.example.com" + assert extract_metadata["file_size_bytes"] == 2048 + + +def test_response_schema_list(sample_extract_metadata): + schema = ExtractMetadataListResponseSchema() + + # Create a list of two metadata records + metadata_list = { + "data": [ + sample_extract_metadata, + ExtractMetadata( + extract_metadata_id=2, + extract_type="opportunities_xml", + file_name="test_extract2.xml", + file_path="/test/path/test_extract2.xml", + file_size_bytes=1024, + ), + ] + } + + result = schema.dump(metadata_list) + + assert len(result["data"]) == 2 + assert result["data"][0]["extract_metadata_id"] == 1 + assert result["data"][0]["extract_type"] == "opportunities_csv" + assert result["data"][1]["extract_metadata_id"] == 2 + assert result["data"][1]["extract_type"] == "opportunities_xml" + + +def test_request_schema_null_values(): + schema = ExtractMetadataRequestSchema() + + # Test with some null values + data = { + "filters": {"extract_type": None, "start_date": "2023-10-01", "end_date": None}, + "pagination": { + "order_by": "created_at", + "page_offset": 1, + "page_size": 25, + "sort_direction": "ascending", + }, + } + + result = schema.load(data) + assert result["filters"]["extract_type"] is None + assert result["filters"]["start_date"] == date(2023, 10, 1) + assert result["filters"]["end_date"] is None
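To tie the new pieces together: `ExtractMetadataResponseSchema` inherits `download_path`, `file_size_bytes`, `created_at`, and `updated_at` from the shared `FileResponseSchema` introduced in `response_schema.py` above, and adds `extract_metadata_id` and `extract_type` on top. A minimal serialization example (illustrative only, not part of the patch; the record values are made up, mirroring the fixtures in the unit tests):

```python
from src.api.extracts_v1.extract_schema import ExtractMetadataResponseSchema
from src.db.models.extract_models import ExtractMetadata

# Build a record the same way the unit tests above do.
record = ExtractMetadata(
    extract_metadata_id=3,
    extract_type="opportunities_csv",
    file_name="extract.csv",
    file_path="/extracts/extract.csv",
    file_size_bytes=512,
)
record.download_path = "https://example.com/extracts/extract.csv"

# Should dump a dict with the FileResponseSchema fields (download_path,
# file_size_bytes, timestamps) plus extract_metadata_id and extract_type.
print(ExtractMetadataResponseSchema().dump(record))
```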