From 5216b2fa65bbb73b6c30260db6b746e3437ccd22 Mon Sep 17 00:00:00 2001 From: Anna Date: Tue, 14 Jan 2025 17:42:36 -0800 Subject: [PATCH 1/8] added neon db url and etl workflow --- .github/workflows/etl_to_neon.yml | 28 ++++++++++++++++++++++++++++ backend/api/config.py | 1 + backend/database/session.py | 2 +- 3 files changed, 30 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/etl_to_neon.yml diff --git a/.github/workflows/etl_to_neon.yml b/.github/workflows/etl_to_neon.yml new file mode 100644 index 00000000..efe52ec7 --- /dev/null +++ b/.github/workflows/etl_to_neon.yml @@ -0,0 +1,28 @@ +name: ETL to neon + +# Workflow triggers +on: + schedule: + - cron: "0 2 * * 0" # Runs at 2am UTC every Sunday + workflow_dispatch: # Allows manual triggering of the workflow + +jobs: + neon-etl: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.12" + + - name: Install dependencies + run: | + pip install -r requirements.txt + + - name: ETL data to Neon DB + run: | + python -m backend.etl.tsunami_data_handler \ No newline at end of file diff --git a/backend/api/config.py b/backend/api/config.py index 86d37386..0f657de4 100644 --- a/backend/api/config.py +++ b/backend/api/config.py @@ -14,6 +14,7 @@ class Settings(BaseSettings): postgres_db: str postgis_version: str frontend_host: str + neon_url: str database_url: str localhost_database_url: str database_url_sqlalchemy: str diff --git a/backend/database/session.py b/backend/database/session.py index 4685ef5a..bb980571 100644 --- a/backend/database/session.py +++ b/backend/database/session.py @@ -3,7 +3,7 @@ from backend.api.config import settings # Set up the database engine using settings -engine = create_engine(settings.database_url_sqlalchemy, echo=True) +engine = create_engine(settings.neon_url, echo=True) # Create a session factory SessionLocal = sessionmaker(autocommit=False, 
autoflush=False, bind=engine) From fd81eb3de0d6e925449707d8aaa1101d4b9dcad0 Mon Sep 17 00:00:00 2001 From: Anna Date: Tue, 14 Jan 2025 17:48:16 -0800 Subject: [PATCH 2/8] added push trigger --- .github/workflows/etl_to_neon.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/etl_to_neon.yml b/.github/workflows/etl_to_neon.yml index efe52ec7..665ae16d 100644 --- a/.github/workflows/etl_to_neon.yml +++ b/.github/workflows/etl_to_neon.yml @@ -4,6 +4,9 @@ name: ETL to neon on: schedule: - cron: "0 2 * * 0" # Runs at 2am UTC every Sunday + push: + branches: + - add_etl_workflow workflow_dispatch: # Allows manual triggering of the workflow jobs: From b6354776a84433d247bf6d6e9760742e2f4b2afa Mon Sep 17 00:00:00 2001 From: Anna Date: Wed, 15 Jan 2025 18:14:01 -0800 Subject: [PATCH 3/8] added neon url --- .github/workflows/env_vars.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/env_vars.yml b/.github/workflows/env_vars.yml index bcd66e9b..c175ad77 100644 --- a/.github/workflows/env_vars.yml +++ b/.github/workflows/env_vars.yml @@ -17,6 +17,7 @@ jobs: envkey_POSTGRES_DB: ${{ secrets.POSTGRES_DB }} envkey_POSTGIS_VERSION: ${{ secrets.POSTGIS_VERSION }} + envkey_NEON_URL: ${{ secrets.NEON_URL }} envkey_FRONTEND_HOST: ${{ secrets.FRONTEND_HOST }} envkey_DATABASE_URL: ${{ secrets.DATABASE_URL }} envkey_LOCALHOST_DATABASE_URL: ${{ secrets.LOCALHOST_DATABASE_URL }} From d04a7de1a4babb1c2ce1ead241d59448bcd718a2 Mon Sep 17 00:00:00 2001 From: Anna Date: Wed, 15 Jan 2025 18:40:53 -0800 Subject: [PATCH 4/8] updated env example --- .env.example | 1 + 1 file changed, 1 insertion(+) diff --git a/.env.example b/.env.example index 423401bc..287cf680 100644 --- a/.env.example +++ b/.env.example @@ -5,6 +5,7 @@ POSTGRES_DB=qsdatabase POSTGIS_VERSION=3.5.0 # Backend Environment Variables +NEON_URL=dummy-neon-url FRONTEND_HOST=http://localhost:3000 DATABASE_URL=postgresql://postgres:password@db:5432/qsdatabase # Connection string for the 
PostgreSQL database LOCALHOST_DATABASE_URL=postgresql://postgres:password@localhost:5432/qsdatabase #Connection string for the PostgreSQL database when running locally From f227d3db9c6697133038123ce8fb54a8f3e095b3 Mon Sep 17 00:00:00 2001 From: Anna Date: Wed, 15 Jan 2025 19:10:23 -0800 Subject: [PATCH 5/8] added env artifact to the workflow --- .github/workflows/etl_to_neon.yml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/.github/workflows/etl_to_neon.yml b/.github/workflows/etl_to_neon.yml index 665ae16d..92e4019f 100644 --- a/.github/workflows/etl_to_neon.yml +++ b/.github/workflows/etl_to_neon.yml @@ -25,6 +25,21 @@ jobs: - name: Install dependencies run: | pip install -r requirements.txt + - name: Get Run ID of Most Recent Successful Run + id: get_run_id + run: | + response=$(curl -s -H "Authorization: token ${{ secrets.GH_PAT }}" \ + "https://api.github.com/repos/sfbrigade/datasci-earthquake/actions/workflows/env_vars.yml/runs?status=completed&conclusion=success") + run_id=$(echo $response | jq '.workflow_runs[0].id') + echo "Run ID: $run_id" + echo "run_id=$run_id" >> $GITHUB_ENV + - name: Download .env Artifact + uses: actions/download-artifact@v4 + with: + name: env-file + github-token: ${{ secrets.GH_PAT }} + repository: sfbrigade/datasci-earthquake + run-id: ${{ env.run_id }} - name: ETL data to Neon DB run: | From 19d257a40e35c02d8024dd52e237e03a8c23826a Mon Sep 17 00:00:00 2001 From: Anna Date: Mon, 20 Jan 2025 13:25:20 -0800 Subject: [PATCH 6/8] updated etl workflow --- .github/workflows/etl_to_neon.yml | 4 +++- backend/api/models/liquefaction_zones.py | 4 +--- backend/api/models/soft_story_properties.py | 4 +++- backend/etl/data_handler.py | 12 ------------ backend/etl/liquefaction_data_handler.py | 9 +++++++++ backend/etl/soft_story_properties_data_handler.py | 6 +++--- 6 files changed, 19 insertions(+), 20 deletions(-) diff --git a/.github/workflows/etl_to_neon.yml b/.github/workflows/etl_to_neon.yml index 92e4019f..dde74b33 
100644 --- a/.github/workflows/etl_to_neon.yml +++ b/.github/workflows/etl_to_neon.yml @@ -43,4 +43,6 @@ jobs: - name: ETL data to Neon DB run: | - python -m backend.etl.tsunami_data_handler \ No newline at end of file + python -m backend.etl.tsunami_data_handler + python -m backend.etl.soft_story_properties_data_handler + python -m backend.etl.liquefaction_data_handler \ No newline at end of file diff --git a/backend/api/models/liquefaction_zones.py b/backend/api/models/liquefaction_zones.py index d56ea376..09d9216d 100644 --- a/backend/api/models/liquefaction_zones.py +++ b/backend/api/models/liquefaction_zones.py @@ -22,9 +22,7 @@ class LiquefactionZone(Base): __tablename__ = "liquefaction_zones" - identifier: Mapped[int] = mapped_column( - Integer, primary_key=True, autoincrement=True - ) + identifier: Mapped[int] = mapped_column(String, primary_key=True) geometry: Mapped[Geometry] = mapped_column(Geometry("MULTIPOLYGON", srid=4326)) liq: Mapped[str] = mapped_column(String) shape_length: Mapped[float] = mapped_column(Float) diff --git a/backend/api/models/soft_story_properties.py b/backend/api/models/soft_story_properties.py index 1d3d41f0..2cdf2ed6 100644 --- a/backend/api/models/soft_story_properties.py +++ b/backend/api/models/soft_story_properties.py @@ -29,7 +29,9 @@ class SoftStoryProperty(Base): block: Mapped[str] = mapped_column(String(_STRING_LENGTH), nullable=True) lot: Mapped[str] = mapped_column(String(_STRING_LENGTH), nullable=True) parcel_number: Mapped[str] = mapped_column(String(_STRING_LENGTH), nullable=True) - property_address: Mapped[str] = mapped_column(String(_STRING_LENGTH), nullable=True) + property_address: Mapped[str] = mapped_column( + String(_STRING_LENGTH), nullable=True, unique=True + ) address: Mapped[str] = mapped_column(String(_STRING_LENGTH), nullable=False) tier: Mapped[int] = mapped_column(Integer, nullable=True) status: Mapped[str] = mapped_column(String(_STRING_LENGTH), nullable=True) diff --git a/backend/etl/data_handler.py 
b/backend/etl/data_handler.py index b2998397..d66660d6 100644 --- a/backend/etl/data_handler.py +++ b/backend/etl/data_handler.py @@ -84,15 +84,3 @@ def bulk_insert_data(self, data_dicts: list[dict], id_field: str): stmt = stmt.on_conflict_do_nothing(index_elements=[id_field]) db.execute(stmt) db.commit() - - def bulk_insert_data_autoincremented(self, data_dicts: list[dict]): - """ - Inserts the list of dictionaries with SQLAlchemy-generated IDs into the database table as - SQLAlchemy objects - """ - # TODO: Implement logic to upsert only changed data - with next(get_db()) as db: - stmt = pg_insert(self.table).values(data_dicts) - stmt = stmt.on_conflict_do_nothing() - db.execute(stmt) - db.commit() diff --git a/backend/etl/liquefaction_data_handler.py b/backend/etl/liquefaction_data_handler.py index 5e001d88..816c3ecc 100644 --- a/backend/etl/liquefaction_data_handler.py +++ b/backend/etl/liquefaction_data_handler.py @@ -3,6 +3,7 @@ from backend.api.models.liquefaction_zones import LiquefactionZone from shapely.geometry import shape from geoalchemy2.shape import from_shape +from geoalchemy2.functions import ST_Simplify _LIQUEFACTION_URL = "https://data.sfgov.org/resource/i4t7-35u3.geojson" @@ -22,7 +23,11 @@ def parse_data(self, data: dict) -> list[dict]: Each dictionary represents a row for the database table. Geometry data is converted into a GeoAlchemy-compatible MultiPolygon with srid 4326. + + To avoid API timeout errors, this method reduces the complexity of the multipolygons by collapsing points that are closer than 0.00001 degrees into a single point. + Note that the dataset contains the length and area of the original multipolygons. 
""" + tolerance = 0.0001 features = data["features"] parsed_data = [] for feature in features: @@ -30,7 +35,11 @@ def parse_data(self, data: dict) -> list[dict]: geometry = feature.get("geometry", {}) multipolygon = shape(geometry) geoalchemy_multipolygon = from_shape(multipolygon, srid=4326) + simplified_geoalchemy_multipolygon = ST_Simplify( + geoalchemy_multipolygon, tolerance + ) liquefaction_zone = { + "identifier": f'{properties.get("shape_leng")}-{properties.get("shape_area")}-{properties.get("liq")}', "liq": properties.get("liq"), "geometry": geoalchemy_multipolygon, "shape_length": properties.get("shape_leng"), diff --git a/backend/etl/soft_story_properties_data_handler.py b/backend/etl/soft_story_properties_data_handler.py index 50c80563..11a95d98 100644 --- a/backend/etl/soft_story_properties_data_handler.py +++ b/backend/etl/soft_story_properties_data_handler.py @@ -4,7 +4,7 @@ from sqlalchemy.ext.declarative import DeclarativeMeta from dotenv import load_dotenv import os -from etl.mapbox_geojson_manager import MapboxGeojsonManager +from backend.etl.mapbox_geojson_manager import MapboxGeojsonManager from typing import Dict, Tuple @@ -32,7 +32,7 @@ def fill_in_missing_mapbox_points( return parsed_data mapbox_coordinates_map: Dict[str, Tuple[float, float]] = ( - self.mapbox_geojson_manager.batch_geocode_addresses(addresses) + self.mapbox_geojson_manager.batch_geocode_addresses(addresses) # type: ignore ) for data_point in parsed_data: @@ -123,6 +123,6 @@ def parse_data(self, sf_data: dict) -> list[dict]: try: soft_story_properties = handler.fetch_data() soft_story_property_objects = handler.parse_data(soft_story_properties) - handler.bulk_insert_data_autoincremented(soft_story_property_objects) + handler.bulk_insert_data(soft_story_property_objects, "property_address") except HTTPException as e: print(f"Failed after retries: {e}") From a79e6d4554caf0b0c1a3f3697f776aa75e403a11 Mon Sep 17 00:00:00 2001 From: Anna Date: Mon, 20 Jan 2025 13:31:07 -0800 
Subject: [PATCH 7/8] removed on push trigger --- .github/workflows/etl_to_neon.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/etl_to_neon.yml b/.github/workflows/etl_to_neon.yml index dde74b33..07820257 100644 --- a/.github/workflows/etl_to_neon.yml +++ b/.github/workflows/etl_to_neon.yml @@ -4,9 +4,6 @@ name: ETL to neon on: schedule: - cron: "0 2 * * 0" # Runs at 2am UTC every Sunday - push: - branches: - - add_etl_workflow workflow_dispatch: # Allows manual triggering of the workflow jobs: From d05bb677721e809e0e80de96c36e913ac964f36c Mon Sep 17 00:00:00 2001 From: agennadi Date: Mon, 20 Jan 2025 14:43:33 -0800 Subject: [PATCH 8/8] Update a comment in liquefaction_data_handler.py --- backend/etl/liquefaction_data_handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/etl/liquefaction_data_handler.py b/backend/etl/liquefaction_data_handler.py index 816c3ecc..1edc1288 100644 --- a/backend/etl/liquefaction_data_handler.py +++ b/backend/etl/liquefaction_data_handler.py @@ -24,7 +24,7 @@ def parse_data(self, data: dict) -> list[dict]: Geometry data is converted into a GeoAlchemy-compatible MultiPolygon with srid 4326. - To avoid API timeout errors, this method reduces the complexity of the multipolygons by collapsing points that are closer than 0.00001 degrees into a single point. + To avoid API timeout errors, this method reduces the complexity of the multipolygons by collapsing points that are closer than 0.0001 degrees into a single point. Note that the dataset contains the length and area of the original multipolygons. """ tolerance = 0.0001