From f77609fe9d008fe391e1914dfca1ac225831591c Mon Sep 17 00:00:00 2001 From: "kai [they]" Date: Tue, 19 Nov 2024 10:57:37 -0800 Subject: [PATCH 1/4] [Issue #2475] Scale staging to the same numbers as prod (#2929) ## Context Relates to #2475 ## Motivation I would like to do the load test in staging, a) because it should be more stable than dev and b) because I don't want to mess up our prod analytics --- infra/api/app-config/staging.tf | 38 +++++++++++++++++----------- infra/frontend/app-config/staging.tf | 23 ++++++++++------- 2 files changed, 37 insertions(+), 24 deletions(-) diff --git a/infra/api/app-config/staging.tf b/infra/api/app-config/staging.tf index d670ee0cb..495056d3d 100644 --- a/infra/api/app-config/staging.tf +++ b/infra/api/app-config/staging.tf @@ -7,23 +7,31 @@ module "staging_config" { database_enable_http_endpoint = true has_incident_management_service = local.has_incident_management_service - # https://docs.aws.amazon.com/AmazonECS/latest/developerguide/service-auto-scaling.html - # https://us-east-1.console.aws.amazon.com/ecs/v2/clusters/api-staging/services/api-staging/health?region=us-east-1 - # instance_desired_instance_count and instance_scaling_min_capacity are scaled for the average CPU and Memory - # seen over 12 months, as of November 2024 exlucing an outlier range around February 2024. - # With a minimum of 2, so CPU doesn't spike to infinity on deploys. + # # https://docs.aws.amazon.com/AmazonECS/latest/developerguide/service-auto-scaling.html + # # https://us-east-1.console.aws.amazon.com/ecs/v2/clusters/api-staging/services/api-staging/health?region=us-east-1 + # # instance_desired_instance_count and instance_scaling_min_capacity are scaled for the average CPU and Memory + # # seen over 12 months, as of November 2024 exlucing an outlier range around February 2024. + # # With a minimum of 2, so CPU doesn't spike to infinity on deploys. 
+ # instance_desired_instance_count = 2 + # instance_scaling_min_capacity = 2 + # # instance_scaling_max_capacity is 5x the instance_scaling_min_capacity + # instance_scaling_max_capacity = 10 + + # # https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/aurora-serverless-v2.setting-capacity.html + # # https://us-east-1.console.aws.amazon.com/rds/home?region=us-east-1#database:id=api-dev;is-cluster=true;tab=monitoring + # # database_min_capacity is average api-staging ServerlessDatabaseCapacity seen over 12 months, as of November 2024 + # database_min_capacity = 2 + # # database_max_capacity is 5x the database_min_capacity + # database_max_capacity = 10 + # database_instance_count = 2 + + # Temporarily scale staging to the same numbers as prod for the load test instance_desired_instance_count = 2 instance_scaling_min_capacity = 2 - # instance_scaling_max_capacity is 5x the instance_scaling_min_capacity - instance_scaling_max_capacity = 10 - - # https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/aurora-serverless-v2.setting-capacity.html - # https://us-east-1.console.aws.amazon.com/rds/home?region=us-east-1#database:id=api-dev;is-cluster=true;tab=monitoring - # database_min_capacity is average api-staging ServerlessDatabaseCapacity seen over 12 months, as of November 2024 - database_min_capacity = 2 - # database_max_capacity is 5x the database_min_capacity - database_max_capacity = 10 - database_instance_count = 2 + instance_scaling_max_capacity = 10 + database_min_capacity = 20 + database_max_capacity = 128 + database_instance_count = 2 has_search = true # https://docs.aws.amazon.com/opensearch-service/latest/developerguide/what-is.html#choosing-version diff --git a/infra/frontend/app-config/staging.tf b/infra/frontend/app-config/staging.tf index dd3a343e1..3aade1592 100644 --- a/infra/frontend/app-config/staging.tf +++ b/infra/frontend/app-config/staging.tf @@ -6,13 +6,18 @@ module "staging_config" { has_database = local.has_database has_incident_management_service = local.has_incident_management_service - # https://docs.aws.amazon.com/AmazonECS/latest/developerguide/service-auto-scaling.html - # https://us-east-1.console.aws.amazon.com/ecs/v2/clusters/frontend-dev/services/frontend-dev/health?region=us-east-1 - # instance_desired_instance_count and instance_scaling_min_capacity are scaled for the average CPU and Memory - # seen over 12 months, as of November 2024 exlucing an outlier range around February 2024. - # With a minimum of 2, so CPU doesn't spike to infinity on deploys. - instance_desired_instance_count = 2 - instance_scaling_min_capacity = 2 - # instance_scaling_max_capacity is 5x the instance_scaling_min_capacity - instance_scaling_max_capacity = 10 + # # https://docs.aws.amazon.com/AmazonECS/latest/developerguide/service-auto-scaling.html + # # https://us-east-1.console.aws.amazon.com/ecs/v2/clusters/frontend-dev/services/frontend-dev/health?region=us-east-1 + # # instance_desired_instance_count and instance_scaling_min_capacity are scaled for the average CPU and Memory + # # seen over 12 months, as of November 2024 exlucing an outlier range around February 2024. + # # With a minimum of 2, so CPU doesn't spike to infinity on deploys. 
+ # instance_desired_instance_count = 2
+ # instance_scaling_min_capacity = 2
+ # # instance_scaling_max_capacity is 5x the instance_scaling_min_capacity
+ # instance_scaling_max_capacity = 10
+
+ # Temporarily scale staging to the same numbers as prod for the load test
+ instance_desired_instance_count = 4
+ instance_scaling_min_capacity = 4
+ instance_scaling_max_capacity = 20
 }

From 7ea147130507b60f0b5de1bd619891a2cc440ed8 Mon Sep 17 00:00:00 2001
From: Aaron Couch
Date: Tue, 19 Nov 2024 14:30:10 -0500
Subject: [PATCH 2/4] [Issue #2474] Update artillery data (#2928)

## Summary

Fixes #2474

### Time to review: __5 mins__

## Changes proposed

Adds data for artillery. Opp ids for dev/stage and prod were grabbed from CSV exports. The prod list was winnowed down by randomly selecting 20,000 opp ids. Agency data was gathered by querying the local, dev, and prod dbs.

---
 DEVELOPMENT.md | 5 +-
 frontend/tests/artillery/params.json | 168 --------------------------
 frontend/tests/artillery/processor.ts | 18 ++-
 3 files changed, 16 insertions(+), 175 deletions(-)
 delete mode 100644 frontend/tests/artillery/params.json

diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md
index d5e46970b..cf82402f5 100644
--- a/DEVELOPMENT.md
+++ b/DEVELOPMENT.md
@@ -115,8 +115,9 @@ Any changes to features, tools, or workflows should include updates or additions
 To run the load test:
 1. Install artillery locally if you haven't done so with `npm install -g artillery@latest`
-2. `$ cd api` or or `$ cd frontend`
-3. `$ make load-test-` where env is either `local`, `dev`, `staging`, or `production`
+2. For the frontend, download the required data from https://drive.google.com/file/d/1zknvVSRqL7xs8VGinztuKelfppYlgRoP and save "params.json" to `frontend/tests/artillery/params.json`
+3. `$ cd api` or `$ cd frontend`
+4.
`$ make load-test-` where env is either `local`, `dev`, `staging`, or `production` - `make load-test-local` - requires running a local container in another console diff --git a/frontend/tests/artillery/params.json b/frontend/tests/artillery/params.json deleted file mode 100644 index ecb30e494..000000000 --- a/frontend/tests/artillery/params.json +++ /dev/null @@ -1,168 +0,0 @@ -{ - "ids": { - "local": [ - 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, - 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35 - ], - "dev": [], - "prod": [] - }, - "queries": [ - "test", - "grants", - "education", - "transportation", - "trauma", - "veterans" - ], - "status": ["posted", "forecasted", "closed", "archived", "none"], - "agencies": [ - "ARPAH", - "USAID", - "USAID-AFG", - "USAID", - "USAID-ARM", - "USAID-AZE", - "USAID-BAN", - "USAID-BEN", - "AC", - "DC", - "USDA", - "USDA-AMS", - "USDA-FNS1", - "DOC", - "DOC-DOCNOAAERA", - "DOC-EDA", - "DOC-NIST", - "DOD", - "DOD-AMC-ACCAPGN", - "DOD-AMC-ACCAPGD", - "DOD-AFRL-AFRLDET8", - "DOD-AFRL", - "DOD-USAFA", - "DOD-AFOSR", - "DOD-DARPA-BTO", - "ED", - "DOE", - "DOE-ARPAE", - "DOE-GFO", - "DOE-01", - "PAMS", - "PAMS-SC", - "HHS", - "HHS-ACF-FYSB", - "HHS-ACF", - "HHS-ACF-CB", - "DHS", - "DHS-DHS", - "DHS-OPO", - "DHS-USCG", - "HUD", - "USDOJ", - "USDOJ-OJP-BJA", - "USDOJ-OJP-COPS", - "DOL", - "DOL-ETA-ILAB", - "DOL-ETA-CEO", - "DOS", - "DOS-NEA-AC", - "DOS-DRL", - "DOS-ECA", - "DOI", - "DOI-BIA", - "DOI-BLM", - "DOI-BOR", - "USDOT", - "USDOT-ORP", - "USDOT-DO-SIPPRA", - "USDOT-GCR", - "DOT", - "DOT-DOT X-50", - "DOT-RITA", - "DOT-FAA-FAA ARG", - "DOT-FRA", - "DOT-FHWA", - "DOT-FTA", - "DOT-FAA-FAA COE-AJFE", - "DOT-FAA-FAA COE-FAA JAMS", - "DOT-FAA-FAA COE-TTHP", - "DOT-MA", - "DOT-NHTSA", - "VA", - "VA-CSHF", - "VA-HPGPDP", - "VA-LSV", - "VA-NVSP", - "VA-NCAC", - "VA-OMHSP", - "VA-SSVF", - "VA-NCA", - "VA-VLGP", - "EPA", - "IMLS", - "MCC", - "NASA", - "NASA-HQ", - "NASA-JSC", - "NASA-SFC", - "NASA", - "NARA", - "NEA", - "NEH", - "NSF", - "SSA" - ], - "eligibility": [ - "state_governments", - "county_governments", - "city_or_township_governments", - "special_district_governments", - "independent_school_districts", - "public_and_state_institutions_of_higher_education", - "private_institutions_of_higher_education", - "federally_recognized_native_american_tribal_governments", - "other_native_american_tribal_organizations", - "public_and_indian_housing_authorities", - "nonprofits_non_higher_education_with_501c3", - "nonprofits_non_higher_education_without_501c3", - "for_profit_organizations_other_than_small_businesses", - "small_businesses", - "other", - "unrestricted" - ], - "funding": [ - "cooperative_agreement", - "grant", - "procurement_contract", - "other" - ], - "category": [ - "recovery_act", - "agriculture", - "arts", - "business_and_commerce", - "community_development", - "consumer_protection", - "disaster_prevention_and_relief", - "education", - "employment_labor_and_training", - "energy", - "environment", - "food_and_nutrition", - "health", - "housing", - "humanities", - "information_and_statistics", - "infrastructure_investment_and_jobs_act", - "income_security_and_social_services", - "law_justice_and_legal_services", - "natural_resources", - "opportunity_zone_benefits", - "regional_development", - "science_technology_and_other_research_and_development", - "transportation", - "affordable_care_act", - "other" - ], - "pages": ["", "process", "research", "subscribe", "subscribe/confirmation"] -} diff --git 
a/frontend/tests/artillery/processor.ts b/frontend/tests/artillery/processor.ts index a6a2681c2..3ff159246 100644 --- a/frontend/tests/artillery/processor.ts +++ b/frontend/tests/artillery/processor.ts @@ -8,13 +8,16 @@ type dataType = { queries: Array; pages: Array; status: Array; - agencies: Array; + agencies: { + [key: string]: Array; + }; funding: Array; eligibility: Array; category: Array; }; type globalVars = { $environment?: string; + env: string; }; type returnVars = { @@ -26,7 +29,7 @@ type returnVars = { // eslint-disable-next-line @typescript-eslint/require-await async function getOppId(context: { vars: dataType & returnVars & globalVars }) { - const env = context.vars.$environment as string; + const env = context.vars.env; context.vars.id = context.vars.ids[env][random(context.vars.ids[env].length - 1)]; } @@ -49,11 +52,14 @@ async function getStatic(context: { vars: returnVars }) { } // eslint-disable-next-line @typescript-eslint/require-await -async function getSearchQuery(context: { vars: returnVars & dataType }) { +async function getSearchQuery(context: { + vars: returnVars & dataType & globalVars; +}) { + const env = context.vars.env; const { queries, status, agencies, eligibility, category } = context.vars; const queryParam = `query=${queries[random(queries.length - 1)]}`; const statusParam = `status=${status[random(status.length - 1)]}`; - const agencyParam = `agency=${agencies[random(agencies.length - 1)]}`; + const agencyParam = `agency=${agencies[env][random(agencies[env].length - 1)]}`; const categoryParam = `category=${category[random(category.length - 1)]}`; const eligibilityParam = `eligibility=${eligibility[random(eligibility.length - 1)]}`; const pagerParam = `page=${random(5)}`; @@ -100,13 +106,15 @@ async function loadData(context: { vars: dataType & globalVars }) { // Dev and stage have the same data. const env = context.vars.$environment === "stage" ? "dev" : context.vars.$environment; + const envs = new Set(["local", "dev", "stage", "prod"]); if (!env || !envs.has(env)) { throw new Error(`env ${env ?? ""} does not exist in env list`); } const path = "./tests/artillery/params.json"; const file = await readFile(path, "utf8"); - context.vars = JSON.parse(file) as dataType; + context.vars = JSON.parse(file) as dataType & globalVars; + context.vars.env = env; } function randomString(length: number) { From c8e2006c9e6ce917a40c49ef3efc2bf8ed0d38ee Mon Sep 17 00:00:00 2001 From: David Dudas Date: Tue, 19 Nov 2024 14:21:30 -0800 Subject: [PATCH 3/4] [Issue 2869] Add versioning to analytics db schema (#2870) ## Summary Fixes #2869 ### Time to review: __5 mins__ ## Changes proposed > What was added, updated, or removed in this PR. 1. Added a simple versioning pattern to enable schema extensibility, including a table in the schema to keep track of current schema version 2. Refactored `make init-db` command to enable execution of multiple SQL files, and update schema version accordingly upon the execution of each file ## Context for reviewers > Testing instructions, background context, more in-depth details of the implementation, and anything else you'd like to call out or ask reviewers. Explain how the changes were verified. This is part of an effort to enable automated deployment of analytics db to staging and prod. ## Additional information > Screenshots, GIF demos, code examples or output to help show the changes working as expected. 
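For reference, the core of the versioning pattern is small. Below is a condensed sketch — not the code in this PR (the real logic lives in `etldb.py` and `main.py` in the diff that follows), and the helper name and arguments are illustrative only — of how the runner ties the numbered `NNNN_description.sql` files to the single-row `schema_version` table:

```python
# Condensed illustration of the migration/versioning pattern; assumes
# SQLAlchemy 2.x, a directory of NNNN_description.sql files, and the
# single-row schema_version table created by migration 0002.
import re
from pathlib import Path

from sqlalchemy import Connection, text


def apply_pending_migrations(
    conn: Connection, migrations_dir: Path, current_version: int
) -> int:
    """Apply every migration newer than current_version, in filename order."""
    for sql_file in sorted(migrations_dir.glob("*.sql")):
        # expected filename format: {4_digit_version_number}_{short_description}.sql
        if not re.match(r"^\d{4}_.+\.sql$", sql_file.name):
            continue  # the real code treats a malformed name as a fatal error
        version = int(sql_file.name[:4])
        if version <= current_version:
            continue  # already applied
        conn.execute(text(sql_file.read_text()))  # run the migration script
        conn.execute(  # bump the single-row version marker
            text(
                "insert into schema_version (version) values (:v) "
                "on conflict(one_row) do update set version = :v"
            ),
            {"v": version},
        )
        conn.commit()
        current_version = version
    return current_version
```

On a fresh database this walks every script in order; on an existing one it only runs what is newer than the recorded version, which is what makes re-running `make init-db` against an already-initialized database safe.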
--- .github/workflows/ci-analytics.yml | 6 +- analytics/src/analytics/cli.py | 4 +- .../analytics/integrations/etldb/__init__.py | 8 +- .../src/analytics/integrations/etldb/etldb.py | 84 ++++++++++- .../src/analytics/integrations/etldb/main.py | 139 ++++++++++++------ .../versions/0001_create_tables_etldb.sql} | 0 .../versions/0002_create_schema_versions.sql | 7 + ...003_alter_tables_set_default_timestamp.sql | 9 ++ analytics/tests/integrations/test_etldb.py | 93 ++++++++++++ 9 files changed, 299 insertions(+), 51 deletions(-) rename analytics/src/analytics/integrations/etldb/{create_etl_db.sql => migrations/versions/0001_create_tables_etldb.sql} (100%) create mode 100644 analytics/src/analytics/integrations/etldb/migrations/versions/0002_create_schema_versions.sql create mode 100644 analytics/src/analytics/integrations/etldb/migrations/versions/0003_alter_tables_set_default_timestamp.sql create mode 100644 analytics/tests/integrations/test_etldb.py diff --git a/.github/workflows/ci-analytics.yml b/.github/workflows/ci-analytics.yml index 6820e657c..6256cc074 100644 --- a/.github/workflows/ci-analytics.yml +++ b/.github/workflows/ci-analytics.yml @@ -39,12 +39,12 @@ jobs: - name: Run linting run: make lint + - name: Run database initialization + run: docker compose down --volumes && make init-db + - name: Run tests run: make test-audit - - name: Run database initialization - run: make init-db - # Both of these tasks are looking for github and slack auth # - name: Export GitHub data # run: make gh-data-export diff --git a/analytics/src/analytics/cli.py b/analytics/src/analytics/cli.py index 491b585b2..44ba05c5f 100644 --- a/analytics/src/analytics/cli.py +++ b/analytics/src/analytics/cli.py @@ -269,7 +269,7 @@ def export_json_to_database(delivery_file: Annotated[str, ISSUE_FILE_ARG]) -> No def initialize_database() -> None: """Initialize etl database.""" logger.info("initializing database") - etldb.init_db() + etldb.initialize_database() logger.info("done") @@ -296,7 +296,7 @@ def transform_and_load( dataset = EtlDataset.load_from_json_file(file_path=issue_file) # sync data to db - etldb.sync_db(dataset, datestamp) + etldb.sync_data(dataset, datestamp) # finish print("transform and load is done") diff --git a/analytics/src/analytics/integrations/etldb/__init__.py b/analytics/src/analytics/integrations/etldb/__init__.py index c1afd0946..3608faff7 100644 --- a/analytics/src/analytics/integrations/etldb/__init__.py +++ b/analytics/src/analytics/integrations/etldb/__init__.py @@ -1,11 +1,11 @@ """Read and write data from/to delivery metrics database.""" __all__ = [ - "init_db", - "sync_db", + "initialize_database", + "sync_data", ] from analytics.integrations.etldb.main import ( - init_db, - sync_db, + initialize_database, + sync_data, ) diff --git a/analytics/src/analytics/integrations/etldb/etldb.py b/analytics/src/analytics/integrations/etldb/etldb.py index 5d9ca2671..63ffa18ed 100644 --- a/analytics/src/analytics/integrations/etldb/etldb.py +++ b/analytics/src/analytics/integrations/etldb/etldb.py @@ -2,7 +2,7 @@ from enum import Enum -from sqlalchemy import Connection +from sqlalchemy import Connection, text from analytics.integrations.db import PostgresDbClient @@ -36,6 +36,88 @@ def commit(self, connection: Connection) -> None: """Commit an open transaction.""" connection.commit() + def get_schema_version(self) -> int: + """Select schema version from etl database.""" + version = 0 + + if self.schema_versioning_exists(): + result = self.connection().execute( + text("select version from 
schema_version"), + ) + row = result.fetchone() + if row: + version = row[0] + + return version + + def set_schema_version(self, new_value: int) -> bool: + """Set schema version number.""" + if not self.schema_versioning_exists(): + return False + + # sanity check new version number + current_version = self.get_schema_version() + if new_value < current_version: + message = ( + "WARNING: cannot bump schema version " + f"from {current_version} to {new_value}" + ) + print(message) + return False + + if new_value > current_version: + cursor = self.connection() + cursor.execute( + text( + "insert into schema_version (version) values (:new_value) " + "on conflict(one_row) do update " + "set version = :new_value", + ), + {"new_value": new_value}, + ) + self.commit(cursor) + return True + + return False + + def revert_to_schema_version(self, new_value: int) -> bool: + """Revert schema version number to the previous version.""" + if not self.schema_versioning_exists(): + return False + + # sanity check new version number + current_version = self.get_schema_version() + if new_value != current_version - 1 or new_value < 0: + message = ( + "WARNING: cannot bump schema version " + f"from {current_version} to {new_value}" + ) + print(message) + return False + + cursor = self.connection() + cursor.execute( + text( + "insert into schema_version (version) values (:new_value) " + "on conflict(one_row) do update " + "set version = :new_value", + ), + {"new_value": new_value}, + ) + self.commit(cursor) + return True + + def schema_versioning_exists(self) -> bool: + """Determine whether schema version table exists.""" + result = self.connection().execute( + text( + "select table_name from information_schema.tables " + "where table_name = 'schema_version'", + ), + ) + row = result.fetchone() + return bool(row and row[0] == "schema_version") + class EtlChangeType(Enum): """An enum to describe ETL change types.""" diff --git a/analytics/src/analytics/integrations/etldb/main.py b/analytics/src/analytics/integrations/etldb/main.py index 3f300042d..81cc549b4 100644 --- a/analytics/src/analytics/integrations/etldb/main.py +++ b/analytics/src/analytics/integrations/etldb/main.py @@ -1,5 +1,7 @@ """Integrate with database to read and write etl data.""" +import os +import re from pathlib import Path from psycopg.errors import InsufficientPrivilege @@ -17,35 +19,55 @@ VERBOSE = False -def init_db() -> None: - """Initialize etl database.""" - # define the path to the sql file - parent_path = Path(__file__).resolve().parent - sql_path = f"{parent_path}/create_etl_db.sql" - - # read sql file - with open(sql_path) as f: - sql = f.read() +def initialize_database() -> None: + """ + Create and/or update an etl database by applying a sequential set of migration scripts. 
+ + It applies the migrations using the following steps: + - Check the current schema version listed in the database + - Retrieve the list of migration scripts ordered by version + - If the current schema version is less than the version of the latest migration script + run the remaining migrations in order + - Bump the schema version in the database to the latest version + - If the current schema version matches the latest script, do nothing + """ + # get connection to database + etldb = EtlDb() + current_version = etldb.get_schema_version() + print(f"current schema version: {current_version}") + + # get all sql file paths and respective version numbers + sql_file_path_map = get_sql_file_paths() + all_versions = sorted(sql_file_path_map.keys()) + + # iterate sql files + migration_count = 0 + for next_version in all_versions: + if next_version <= current_version: + continue + # read sql file + with open(sql_file_path_map[next_version]) as f: + sql = f.read() + # execute sql + print(f"applying migration for schema version: {next_version}") + print(f"migration source file: {sql_file_path_map[next_version]}") + cursor = etldb.connection() + cursor.execute( + text(sql), + ) + # commit changes + etldb.commit(cursor) + # bump schema version number + _ = etldb.set_schema_version(next_version) + current_version = next_version + migration_count += 1 - # execute sql - try: - db = EtlDb() - cursor = db.connection() - cursor.execute( - text(sql), - ) - db.commit(cursor) - except ( - RuntimeError, - ProgrammingError, - OperationalError, - InsufficientPrivilege, - ) as e: - message = f"FATAL: Failed to initialize db: {e}" - raise RuntimeError(message) from e + # summarize results in output + print(f"total migrations applied: {migration_count}") + print(f"new schema version: {current_version}") -def sync_db(dataset: EtlDataset, effective: str) -> None: +def sync_data(dataset: EtlDataset, effective: str) -> None: """Write github data to etl database.""" # initialize a map of github id to db row id ghid_map: dict[EtlEntityType, dict[str, int]] = { @@ -63,10 +85,10 @@ def sync_db(dataset: EtlDataset, effective: str) -> None: ghid_map[EtlEntityType.QUAD] = sync_quads(db, dataset) print(f"quad row(s) processed: {len(ghid_map[EtlEntityType.QUAD])}") except ( - RuntimeError, - ProgrammingError, - OperationalError, InsufficientPrivilege, + OperationalError, + ProgrammingError, + RuntimeError, ) as e: message = f"FATAL: Failed to sync quad data: {e}" raise RuntimeError(message) from e @@ -82,10 +104,10 @@ def sync_db(dataset: EtlDataset, effective: str) -> None: f"deliverable row(s) processed: {len(ghid_map[EtlEntityType.DELIVERABLE])}", ) except ( - RuntimeError, - ProgrammingError, - OperationalError, InsufficientPrivilege, + OperationalError, + ProgrammingError, + RuntimeError, ) as e: message = f"FATAL: Failed to sync deliverable data: {e}" raise RuntimeError(message) from e @@ -95,10 +117,10 @@ def sync_db(dataset: EtlDataset, effective: str) -> None: ghid_map[EtlEntityType.SPRINT] = sync_sprints(db, dataset, ghid_map) print(f"sprint row(s) processed: {len(ghid_map[EtlEntityType.SPRINT])}") except ( - RuntimeError, - ProgrammingError, - OperationalError, InsufficientPrivilege, + OperationalError, + ProgrammingError, + RuntimeError, ) as e: message = f"FATAL: Failed to sync sprint data: {e}" raise RuntimeError(message) from e @@ -108,10 +130,10 @@ def sync_db(dataset: EtlDataset, effective: str) -> None: ghid_map[EtlEntityType.EPIC] = sync_epics(db, dataset, ghid_map) print(f"epic row(s) processed: 
{len(ghid_map[EtlEntityType.EPIC])}") except ( - RuntimeError, - ProgrammingError, - OperationalError, InsufficientPrivilege, + OperationalError, + ProgrammingError, + RuntimeError, ) as e: message = f"FATAL: Failed to sync epic data: {e}" raise RuntimeError(message) from e @@ -121,10 +143,10 @@ def sync_db(dataset: EtlDataset, effective: str) -> None: issue_map = sync_issues(db, dataset, ghid_map) print(f"issue row(s) processed: {len(issue_map)}") except ( - RuntimeError, - ProgrammingError, - OperationalError, InsufficientPrivilege, + OperationalError, + ProgrammingError, + RuntimeError, ) as e: message = f"FATAL: Failed to sync issue data: {e}" raise RuntimeError(message) from e @@ -190,3 +212,38 @@ def sync_quads(db: EtlDb, dataset: EtlDataset) -> dict: f"QUAD '{ghid}' title = '{quad_df['quad_name']}', row_id = {result[ghid]}", ) return result + + +def get_sql_file_paths() -> dict[int, str]: + """Get all sql files needed for database initialization.""" + result = {} + + # define the path to the sql files + sql_file_directory = f"{Path(__file__).resolve().parent}/migrations/versions" + + # get list of sorted filenames + filename_list = sorted(os.listdir(sql_file_directory)) + + # expected filename format: {4_digit_version_number}_{short_description}.sql + # example: 0003_alter_tables_set_default_timestamp.sql + pattern = re.compile(r"^\d\d\d\d_.+\.sql$") + + # compile dict of results + for filename in filename_list: + # validate filename format + if not pattern.match(filename): + message = f"FATAL: malformed db migration filename: {filename}" + raise RuntimeError(message) + + # extrace version number from filename + version = int(filename[:4]) + + # do not allow duplicate version number + if version in result: + message = f"FATAL: Duplicate db migration version number: {version} " + raise RuntimeError(message) + + # map the version number to the file path + result[version] = f"{sql_file_directory}/{filename}" + + return result diff --git a/analytics/src/analytics/integrations/etldb/create_etl_db.sql b/analytics/src/analytics/integrations/etldb/migrations/versions/0001_create_tables_etldb.sql similarity index 100% rename from analytics/src/analytics/integrations/etldb/create_etl_db.sql rename to analytics/src/analytics/integrations/etldb/migrations/versions/0001_create_tables_etldb.sql diff --git a/analytics/src/analytics/integrations/etldb/migrations/versions/0002_create_schema_versions.sql b/analytics/src/analytics/integrations/etldb/migrations/versions/0002_create_schema_versions.sql new file mode 100644 index 000000000..31d1d0eaa --- /dev/null +++ b/analytics/src/analytics/integrations/etldb/migrations/versions/0002_create_schema_versions.sql @@ -0,0 +1,7 @@ +CREATE TABLE IF NOT EXISTS schema_version ( + one_row BOOL GENERATED ALWAYS AS (TRUE) STORED, + version INTEGER NOT NULL, + t_created TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + t_modified TIMESTAMP DEFAULT CURRENT_TIMESTAMP +); +CREATE UNIQUE INDEX IF NOT EXISTS sv_i1 ON schema_version(one_row); diff --git a/analytics/src/analytics/integrations/etldb/migrations/versions/0003_alter_tables_set_default_timestamp.sql b/analytics/src/analytics/integrations/etldb/migrations/versions/0003_alter_tables_set_default_timestamp.sql new file mode 100644 index 000000000..d1d325145 --- /dev/null +++ b/analytics/src/analytics/integrations/etldb/migrations/versions/0003_alter_tables_set_default_timestamp.sql @@ -0,0 +1,9 @@ +ALTER TABLE gh_deliverable ALTER COLUMN t_modified SET DEFAULT CURRENT_TIMESTAMP; +ALTER TABLE gh_deliverable_quad_map ALTER 
COLUMN t_modified SET DEFAULT CURRENT_TIMESTAMP; +ALTER TABLE gh_epic ALTER COLUMN t_modified SET DEFAULT CURRENT_TIMESTAMP; +ALTER TABLE gh_epic_deliverable_map ALTER COLUMN t_modified SET DEFAULT CURRENT_TIMESTAMP; +ALTER TABLE gh_issue ALTER COLUMN t_modified SET DEFAULT CURRENT_TIMESTAMP; +ALTER TABLE gh_issue_history ALTER COLUMN t_modified SET DEFAULT CURRENT_TIMESTAMP; +ALTER TABLE gh_issue_sprint_map ALTER COLUMN t_modified SET DEFAULT CURRENT_TIMESTAMP; +ALTER TABLE gh_sprint ALTER COLUMN t_modified SET DEFAULT CURRENT_TIMESTAMP; +ALTER TABLE gh_quad ALTER COLUMN t_modified SET DEFAULT CURRENT_TIMESTAMP; diff --git a/analytics/tests/integrations/test_etldb.py b/analytics/tests/integrations/test_etldb.py new file mode 100644 index 000000000..c67d49fc7 --- /dev/null +++ b/analytics/tests/integrations/test_etldb.py @@ -0,0 +1,93 @@ +"""Tests the code in integrations/etldb.py.""" + +import sqlalchemy +from analytics.integrations.etldb.etldb import EtlDb + + +class TestEtlDb: + """Test EtlDb methods.""" + + TEST_FILE_1 = "./tests/etldb_test_01.json" + + def test_instantiate_with_effective_date(self): + """Class method should return the correctly instantiated object.""" + effective_date = "2024-11-18" + etldb = EtlDb(effective_date) + + assert etldb.effective_date == effective_date + + def test_database_connection(self): + """Class method should return database connection object.""" + etldb = EtlDb() + connection = etldb.connection() + + assert isinstance(connection, sqlalchemy.Connection) + + def test_schema_versioning(self): + """Class methods should return appropriate values.""" + etldb = EtlDb() + has_versioning = etldb.schema_versioning_exists() + current_version = etldb.get_schema_version() + + if has_versioning: + assert current_version >= 2 + else: + assert current_version <= 1 + + def test_set_version_number(self): + """Class method should successfully update version.""" + etldb = EtlDb() + if not etldb.schema_versioning_exists(): + return + + original_version = etldb.get_schema_version() + next_version = original_version + 1 + result = etldb.set_schema_version(next_version) + + assert result is True + assert etldb.get_schema_version() == next_version + + # revert to keep testing env in same state + etldb.revert_to_schema_version(original_version) + + def test_set_bad_version_number(self): + """Class method should not update version.""" + etldb = EtlDb() + if not etldb.schema_versioning_exists(): + return + + current_version = etldb.get_schema_version() + previous_version = current_version - 1 + result = etldb.set_schema_version(previous_version) + + assert result is False + assert etldb.get_schema_version() == current_version + + def test_revert_to_version_number(self): + """Class method should successfully update version.""" + etldb = EtlDb() + if not etldb.schema_versioning_exists(): + return + + original_version = etldb.get_schema_version() + previous_version = original_version - 1 + result = etldb.revert_to_schema_version(previous_version) + + assert result is True + assert etldb.get_schema_version() == previous_version + + # revert the revert to keep testing env in same state + etldb.set_schema_version(original_version) + + def test_revert_to_bad_version_number(self): + """Class method should successfully update version.""" + etldb = EtlDb() + if not etldb.schema_versioning_exists(): + return + + original_version = etldb.get_schema_version() + previous_version = -99 + result = etldb.revert_to_schema_version(previous_version) + + assert result is False + assert 
etldb.get_schema_version() == original_version From 53bf5361b40252ba95b9275ec7a2673a94ab9f83 Mon Sep 17 00:00:00 2001 From: Mike H Date: Wed, 20 Nov 2024 11:32:40 -0500 Subject: [PATCH 4/4] [Issue #2792] Add schemas for extract metadata list API (#2900) ## Summary Fixes #2792 ### Time to review: 10 mins ## Changes proposed Add request/response schemas for ExtractMetadata listing API ## Context for reviewers Creates the input/output schemas needed by the API to support querying extracts and returning data. ## Additional information See unit tests --- api/openapi.generated.yml | 27 +++-- api/src/api/extracts_v1/extract_schema.py | 56 +++++++++ .../opportunities_v1/opportunity_schemas.py | 19 +-- api/src/api/schemas/response_schema.py | 13 +++ .../api/extracts_v1/test_extract_schema.py | 110 ++++++++++++++++++ 5 files changed, 198 insertions(+), 27 deletions(-) create mode 100644 api/src/api/extracts_v1/extract_schema.py create mode 100644 api/tests/src/api/extracts_v1/test_extract_schema.py diff --git a/api/openapi.generated.yml b/api/openapi.generated.yml index f11e5e235..768522e27 100644 --- a/api/openapi.generated.yml +++ b/api/openapi.generated.yml @@ -1163,8 +1163,19 @@ components: properties: download_path: type: string - description: The URL to download the attachment - example: https://... + description: The file's download path + file_size_bytes: + type: integer + description: The size of the file in bytes + example: 1024 + created_at: + type: string + format: date-time + readOnly: true + updated_at: + type: string + format: date-time + readOnly: true mime_type: type: string description: The MIME type of the attachment @@ -1177,10 +1188,6 @@ components: type: string description: A description of the attachment example: The full announcement NOFO - file_size_bytes: - type: integer - description: The size of the attachment in bytes - example: 10012 opportunity_attachment_type: description: The type of attachment example: !!python/object/apply:src.constants.lookup_constants.OpportunityAttachmentType @@ -1190,14 +1197,6 @@ components: - other type: - string - created_at: - type: string - format: date-time - readOnly: true - updated_at: - type: string - format: date-time - readOnly: true OpportunityWithAttachmentsV1: type: object properties: diff --git a/api/src/api/extracts_v1/extract_schema.py b/api/src/api/extracts_v1/extract_schema.py new file mode 100644 index 000000000..2bfc7678a --- /dev/null +++ b/api/src/api/extracts_v1/extract_schema.py @@ -0,0 +1,56 @@ +from src.api.schemas.extension import Schema, fields +from src.api.schemas.response_schema import AbstractResponseSchema, FileResponseSchema +from src.constants.lookup_constants import ExtractType +from src.pagination.pagination_schema import generate_pagination_schema + + +class ExtractMetadataFilterV1Schema(Schema): + extract_type = fields.Enum( + ExtractType, + allow_none=True, + metadata={ + "description": "The type of extract to filter by", + "example": "opportunities_csv", + }, + ) + start_date = fields.Date( + allow_none=True, + metadata={ + "description": "The start date for filtering extracts", + "example": "2023-10-01", + }, + ) + end_date = fields.Date( + allow_none=True, + metadata={ + "description": "The end date for filtering extracts", + "example": "2023-10-07", + }, + ) + + +class ExtractMetadataRequestSchema(AbstractResponseSchema): + filters = fields.Nested(ExtractMetadataFilterV1Schema()) + pagination = fields.Nested( + generate_pagination_schema( + "ExtractMetadataPaginationV1Schema", + ["created_at"], + ), 
+ required=True, + ) + + +class ExtractMetadataResponseSchema(FileResponseSchema): + extract_metadata_id = fields.Integer( + metadata={"description": "The ID of the extract metadata", "example": 1} + ) + extract_type = fields.String( + metadata={"description": "The type of extract", "example": "opportunity_data_extract"} + ) + + +class ExtractMetadataListResponseSchema(AbstractResponseSchema): + data = fields.List( + fields.Nested(ExtractMetadataResponseSchema), + metadata={"description": "A list of extract metadata records"}, + ) diff --git a/api/src/api/opportunities_v1/opportunity_schemas.py b/api/src/api/opportunities_v1/opportunity_schemas.py index cd969ab35..471bf2438 100644 --- a/api/src/api/opportunities_v1/opportunity_schemas.py +++ b/api/src/api/opportunities_v1/opportunity_schemas.py @@ -1,7 +1,11 @@ from enum import StrEnum from src.api.schemas.extension import Schema, fields, validators -from src.api.schemas.response_schema import AbstractResponseSchema, PaginationMixinSchema +from src.api.schemas.response_schema import ( + AbstractResponseSchema, + FileResponseSchema, + PaginationMixinSchema, +) from src.api.schemas.search_schema import ( BoolSearchSchemaBuilder, DateSearchSchemaBuilder, @@ -309,13 +313,7 @@ class OpportunityV1Schema(Schema): updated_at = fields.DateTime(dump_only=True) -class OpportunityAttachmentV1Schema(Schema): - download_path = fields.String( - metadata={ - "description": "The URL to download the attachment", - "example": "https://...", - } - ) +class OpportunityAttachmentV1Schema(FileResponseSchema): mime_type = fields.String( metadata={"description": "The MIME type of the attachment", "example": "application/pdf"} ) @@ -328,9 +326,6 @@ class OpportunityAttachmentV1Schema(Schema): "example": "The full announcement NOFO", } ) - file_size_bytes = fields.Integer( - metadata={"description": "The size of the attachment in bytes", "example": 10012} - ) opportunity_attachment_type = fields.Enum( OpportunityAttachmentType, metadata={ @@ -338,8 +333,6 @@ class OpportunityAttachmentV1Schema(Schema): "example": OpportunityAttachmentType.NOTICE_OF_FUNDING_OPPORTUNITY, }, ) - created_at = fields.DateTime(dump_only=True) - updated_at = fields.DateTime(dump_only=True) class OpportunityWithAttachmentsV1Schema(OpportunityV1Schema): diff --git a/api/src/api/schemas/response_schema.py b/api/src/api/schemas/response_schema.py index 3f267ec3b..4220d4465 100644 --- a/api/src/api/schemas/response_schema.py +++ b/api/src/api/schemas/response_schema.py @@ -58,3 +58,16 @@ class ErrorResponseSchema(Schema): "example": "550e8400-e29b-41d4-a716-446655440000", } ) + + +class FileResponseSchema(Schema): + download_path = fields.String( + metadata={ + "description": "The file's download path", + }, + ) + file_size_bytes = fields.Integer( + metadata={"description": "The size of the file in bytes", "example": 1024} + ) + created_at = fields.DateTime(dump_only=True) + updated_at = fields.DateTime(dump_only=True) diff --git a/api/tests/src/api/extracts_v1/test_extract_schema.py b/api/tests/src/api/extracts_v1/test_extract_schema.py new file mode 100644 index 000000000..568aa54d3 --- /dev/null +++ b/api/tests/src/api/extracts_v1/test_extract_schema.py @@ -0,0 +1,110 @@ +from datetime import date + +import pytest +from marshmallow import ValidationError + +from src.api.extracts_v1.extract_schema import ( + ExtractMetadataListResponseSchema, + ExtractMetadataRequestSchema, + ExtractMetadataResponseSchema, +) +from src.db.models.extract_models import ExtractMetadata + + +@pytest.fixture +def 
sample_extract_metadata(): + return ExtractMetadata( + extract_metadata_id=1, + extract_type="opportunities_csv", + file_name="test_extract.csv", + file_path="/test/path/test_extract.csv", + file_size_bytes=2048, + ) + + +def test_request_schema_validation(): + schema = ExtractMetadataRequestSchema() + + # Test valid data + valid_data = { + "filters": { + "extract_type": "opportunities_csv", + "start_date": "2023-10-01", + "end_date": "2023-10-07", + }, + "pagination": { + "order_by": "created_at", + "page_offset": 1, + "page_size": 25, + "sort_direction": "ascending", + }, + } + result = schema.load(valid_data) + assert result["filters"]["extract_type"] == "opportunities_csv" + assert result["filters"]["start_date"] == date(2023, 10, 1) + assert result["filters"]["end_date"] == date(2023, 10, 7) + + # Test invalid extract_type + invalid_data = {"extract_type": "invalid_type", "start_date": "2023-10-01"} + with pytest.raises(ValidationError): + schema.load(invalid_data) + + +def test_response_schema_single(sample_extract_metadata): + schema = ExtractMetadataResponseSchema() + + sample_extract_metadata.download_path = "http://www.example.com" + extract_metadata = schema.dump(sample_extract_metadata) + + assert extract_metadata["download_path"] == "http://www.example.com" + + assert extract_metadata["extract_metadata_id"] == 1 + assert extract_metadata["extract_type"] == "opportunities_csv" + assert extract_metadata["download_path"] == "http://www.example.com" + assert extract_metadata["file_size_bytes"] == 2048 + + +def test_response_schema_list(sample_extract_metadata): + schema = ExtractMetadataListResponseSchema() + + # Create a list of two metadata records + metadata_list = { + "data": [ + sample_extract_metadata, + ExtractMetadata( + extract_metadata_id=2, + extract_type="opportunities_xml", + file_name="test_extract2.xml", + file_path="/test/path/test_extract2.xml", + file_size_bytes=1024, + ), + ] + } + + result = schema.dump(metadata_list) + + assert len(result["data"]) == 2 + assert result["data"][0]["extract_metadata_id"] == 1 + assert result["data"][0]["extract_type"] == "opportunities_csv" + assert result["data"][1]["extract_metadata_id"] == 2 + assert result["data"][1]["extract_type"] == "opportunities_xml" + + +def test_request_schema_null_values(): + schema = ExtractMetadataRequestSchema() + + # Test with some null values + data = { + "filters": {"extract_type": None, "start_date": "2023-10-01", "end_date": None}, + "pagination": { + "order_by": "created_at", + "page_offset": 1, + "page_size": 25, + "sort_direction": "ascending", + }, + } + + result = schema.load(data) + assert result["filters"]["extract_type"] is None + assert result["filters"]["start_date"] == date(2023, 10, 1) + assert result["filters"]["end_date"] is None
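To tie the new pieces together: `ExtractMetadataResponseSchema` inherits `download_path`, `file_size_bytes`, `created_at`, and `updated_at` from the shared `FileResponseSchema` introduced in `response_schema.py` above, and adds `extract_metadata_id` and `extract_type` on top. A minimal serialization example (illustrative only, not part of the patch; the record values are made up, mirroring the fixtures in the unit tests):

```python
from src.api.extracts_v1.extract_schema import ExtractMetadataResponseSchema
from src.db.models.extract_models import ExtractMetadata

# Build a record the same way the unit tests above do.
record = ExtractMetadata(
    extract_metadata_id=3,
    extract_type="opportunities_csv",
    file_name="extract.csv",
    file_path="/extracts/extract.csv",
    file_size_bytes=512,
)
record.download_path = "https://example.com/extracts/extract.csv"

# Should dump a dict with the FileResponseSchema fields (download_path,
# file_size_bytes, timestamps) plus extract_metadata_id and extract_type.
print(ExtractMetadataResponseSchema().dump(record))
```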