
Load data into Vercel using GitHub Actions #161

Merged · 11 commits · Jan 21, 2025
1 change: 1 addition & 0 deletions .env.example
@@ -5,6 +5,7 @@ POSTGRES_DB=qsdatabase
 POSTGIS_VERSION=3.5.0
 
 # Backend Environment Variables
+NEON_URL=dummy-neon-url
 FRONTEND_HOST=http://localhost:3000
 DATABASE_URL=postgresql://postgres:password@db:5432/qsdatabase # Connection string for the PostgreSQL database
 LOCALHOST_DATABASE_URL=postgresql://postgres:password@localhost:5432/qsdatabase # Connection string for the PostgreSQL database when running locally
1 change: 1 addition & 0 deletions .github/workflows/env_vars.yml
@@ -17,6 +17,7 @@ jobs:
          envkey_POSTGRES_DB: ${{ secrets.POSTGRES_DB }}
          envkey_POSTGIS_VERSION: ${{ secrets.POSTGIS_VERSION }}
+         envkey_NEON_URL: ${{ secrets.NEON_URL }}
          envkey_FRONTEND_HOST: ${{ secrets.FRONTEND_HOST }}
          envkey_DATABASE_URL: ${{ secrets.DATABASE_URL }}
          envkey_LOCALHOST_DATABASE_URL: ${{ secrets.LOCALHOST_DATABASE_URL }}
45 changes: 45 additions & 0 deletions .github/workflows/etl_to_neon.yml
@@ -0,0 +1,45 @@
name: ETL to neon

# Workflow triggers
on:
  schedule:
    - cron: "0 2 * * 0" # Runs at 2am UTC every Sunday
  workflow_dispatch: # Allows manual triggering of the workflow

jobs:
  neon-etl:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.12"

      - name: Install dependencies
        run: |
          pip install -r requirements.txt

      - name: Get Run ID of Most Recent Successful Run
        id: get_run_id
        run: |
          response=$(curl -s -H "Authorization: token ${{ secrets.GH_PAT }}" \
            "https://api.github.com/repos/sfbrigade/datasci-earthquake/actions/workflows/env_vars.yml/runs?status=completed&conclusion=success")
          run_id=$(echo $response | jq '.workflow_runs[0].id')
          echo "Run ID: $run_id"
          echo "run_id=$run_id" >> $GITHUB_ENV

      - name: Download .env Artifact
        uses: actions/download-artifact@v4
        with:
          name: env-file
          github-token: ${{ secrets.GH_PAT }}
          repository: sfbrigade/datasci-earthquake
          run-id: ${{ env.run_id }}

      - name: ETL data to Neon DB
        run: |
          python -m backend.etl.tsunami_data_handler
          python -m backend.etl.soft_story_properties_data_handler
          python -m backend.etl.liquefaction_data_handler
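The "Get Run ID" step above leans on `jq '.workflow_runs[0].id'` to pick the newest matching run out of the GitHub API response. As a sketch for local sanity-checking (the helper name and example payload are ours, not part of the PR):

```python
def extract_latest_run_id(runs_payload: dict) -> int:
    # Equivalent of `jq '.workflow_runs[0].id'`: the API returns runs
    # newest-first, so index 0 is the most recent run matching the
    # status=completed&conclusion=success query in the workflow.
    return runs_payload["workflow_runs"][0]["id"]

# Shape of the relevant slice of the API response (IDs are made up):
example = {"workflow_runs": [{"id": 12345678}, {"id": 12345600}]}
```

The downstream `Download .env Artifact` step then feeds that ID in via `run-id: ${{ env.run_id }}`.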
1 change: 1 addition & 0 deletions backend/api/config.py
@@ -14,6 +14,7 @@ class Settings(BaseSettings):
     postgres_db: str
     postgis_version: str
     frontend_host: str
+    neon_url: str
     database_url: str
     localhost_database_url: str
     database_url_sqlalchemy: str
4 changes: 1 addition & 3 deletions backend/api/models/liquefaction_zones.py
@@ -22,9 +22,7 @@ class LiquefactionZone(Base):
 
     __tablename__ = "liquefaction_zones"
 
-    identifier: Mapped[int] = mapped_column(
-        Integer, primary_key=True, autoincrement=True
-    )
+    identifier: Mapped[int] = mapped_column(String, primary_key=True)
     geometry: Mapped[Geometry] = mapped_column(Geometry("MULTIPOLYGON", srid=4326))
     liq: Mapped[str] = mapped_column(String)
     shape_length: Mapped[float] = mapped_column(Float)
4 changes: 3 additions & 1 deletion backend/api/models/soft_story_properties.py
@@ -29,7 +29,9 @@ class SoftStoryProperty(Base):
     block: Mapped[str] = mapped_column(String(_STRING_LENGTH), nullable=True)
     lot: Mapped[str] = mapped_column(String(_STRING_LENGTH), nullable=True)
     parcel_number: Mapped[str] = mapped_column(String(_STRING_LENGTH), nullable=True)
-    property_address: Mapped[str] = mapped_column(String(_STRING_LENGTH), nullable=True)
+    property_address: Mapped[str] = mapped_column(
+        String(_STRING_LENGTH), nullable=True, unique=True
+    )
     address: Mapped[str] = mapped_column(String(_STRING_LENGTH), nullable=False)
     tier: Mapped[int] = mapped_column(Integer, nullable=True)
     status: Mapped[str] = mapped_column(String(_STRING_LENGTH), nullable=True)
2 changes: 1 addition & 1 deletion backend/database/session.py
@@ -3,7 +3,7 @@
 from backend.api.config import settings
 
 # Set up the database engine using settings
-engine = create_engine(settings.database_url_sqlalchemy, echo=True)
+engine = create_engine(settings.neon_url, echo=True)
Collaborator: Are there consequences to changing this script to connect to the Neon DB as opposed to the local DB? As in, if I want to test my local DB, do I need to change this line back to connect to database_url_sqlalchemy?

Author: Yes, you'd have to change this line back to test locally. If this is a problem, I can add an if-clause that would switch the url based on the environment.

Collaborator: Ok, I think it's fine for now. I would probably prefer putting more data in the test DB.


 # Create a session factory
 SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
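The if-clause the author offers in the thread above could look like the following sketch. The `ENVIRONMENT` variable name and its values are assumptions for illustration, not part of this PR:

```python
import os

def pick_database_url(neon_url: str, local_url: str) -> str:
    # Hypothetical switch: connect to Neon only when ENVIRONMENT says
    # "production"; otherwise fall back to the local connection string,
    # so local testing does not require editing session.py.
    if os.getenv("ENVIRONMENT", "local") == "production":
        return neon_url
    return local_url
```

session.py would then call something like `create_engine(pick_database_url(settings.neon_url, settings.database_url_sqlalchemy), echo=True)`.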
12 changes: 0 additions & 12 deletions backend/etl/data_handler.py
@@ -85,15 +85,3 @@ def bulk_insert_data(self, data_dicts: list[dict], id_field: str):
             stmt = stmt.on_conflict_do_nothing(index_elements=[id_field])
             db.execute(stmt)
             db.commit()
-
-    def bulk_insert_data_autoincremented(self, data_dicts: list[dict]):
-        """
-        Inserts the list of dictionaries with SQLAlchemy-generated IDs into the database table as
-        SQLAlchemy objects
-        """
-        # TODO: Implement logic to upsert only changed data
-        with next(get_db()) as db:
-            stmt = pg_insert(self.table).values(data_dicts)
-            stmt = stmt.on_conflict_do_nothing()
-            db.execute(stmt)
-            db.commit()
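With the autoincremented variant removed, every handler goes through `bulk_insert_data`, whose `ON CONFLICT ... DO NOTHING` clause needs a unique index on `id_field` (which is why this PR adds `unique=True` to `property_address`). A standalone sketch of the SQL that pattern generates, using a toy table rather than the project's models:

```python
from sqlalchemy import Column, MetaData, String, Table
from sqlalchemy.dialects import postgresql
from sqlalchemy.dialects.postgresql import insert as pg_insert

metadata = MetaData()
# Toy stand-in for a handler's table; only the conflict column matters here.
demo = Table(
    "demo",
    metadata,
    Column("property_address", String, primary_key=True),
    Column("status", String),
)

def build_insert(rows: list, id_field: str):
    # Mirrors bulk_insert_data: rows whose id_field already exists are
    # skipped rather than updated.
    stmt = pg_insert(demo).values(rows)
    return stmt.on_conflict_do_nothing(index_elements=[id_field])

# Compile against the PostgreSQL dialect to see the conflict clause.
sql = str(
    build_insert([{"property_address": "1 Main St", "status": "ok"}],
                 "property_address").compile(dialect=postgresql.dialect())
)
```

Because the skip is conflict-based, re-running the ETL job is idempotent for unchanged rows; the remaining TODO in the handler is upserting rows that did change.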
9 changes: 9 additions & 0 deletions backend/etl/liquefaction_data_handler.py
@@ -3,6 +3,7 @@
 from backend.api.models.liquefaction_zones import LiquefactionZone
 from shapely.geometry import shape
 from geoalchemy2.shape import from_shape
+from geoalchemy2.functions import ST_Simplify
 
 
 _LIQUEFACTION_URL = "https://data.sfgov.org/resource/i4t7-35u3.geojson"
@@ -22,15 +23,23 @@ def parse_data(self, data: dict) -> list[dict]:
         Each dictionary represents a row for the database table.
         Geometry data is converted into a GeoAlchemy-compatible
         MultiPolygon with srid 4326.
+
+        To avoid API timeout errors, this method reduces the complexity of the multipolygons by collapsing points that are closer than 0.0001 degrees into a single point.
+        Note that the dataset contains the length and area of the original multipolygons.
         """
+        tolerance = 0.0001
         features = data["features"]
         parsed_data = []
         for feature in features:
             properties = feature.get("properties", {})
             geometry = feature.get("geometry", {})
             multipolygon = shape(geometry)
             geoalchemy_multipolygon = from_shape(multipolygon, srid=4326)
+            simplified_geoalchemy_multipolygon = ST_Simplify(
+                geoalchemy_multipolygon, tolerance
+            )
             liquefaction_zone = {
+                "identifier": f'{properties.get("shape_leng")}-{properties.get("shape_area")}-{properties.get("liq")}',
                 "liq": properties.get("liq"),
                 "geometry": geoalchemy_multipolygon,
                 "shape_length": properties.get("shape_leng"),
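`ST_Simplify` runs inside PostGIS, so its effect is easiest to see with a toy version of the same Douglas-Peucker idea. This pure-Python sketch is ours, not the PR's code; it shows how a tolerance of 0.0001 degrees drops vertices that barely deviate from the simplified outline:

```python
def _deviation(pt, a, b):
    # Perpendicular distance (in degrees) from pt to the chord a-b.
    (x, y), (x1, y1), (x2, y2) = pt, a, b
    num = abs((y2 - y1) * x - (x2 - x1) * y + x2 * y1 - y2 * x1)
    den = ((y2 - y1) ** 2 + (x2 - x1) ** 2) ** 0.5
    return num / den if den else ((x - x1) ** 2 + (y - y1) ** 2) ** 0.5

def simplify(points, tolerance=0.0001):
    # Douglas-Peucker: keep the vertex farthest from the chord if it
    # deviates by more than tolerance, otherwise collapse to the chord.
    if len(points) < 3:
        return list(points)
    dists = [_deviation(p, points[0], points[-1]) for p in points[1:-1]]
    i = max(range(len(dists)), key=dists.__getitem__) + 1
    if dists[i - 1] <= tolerance:
        return [points[0], points[-1]]
    return simplify(points[: i + 1], tolerance)[:-1] + simplify(points[i:], tolerance)
```

A vertex that sits 0.00004 degrees off the line between its neighbours is dropped, while one 0.5 degrees away survives, which is why the dataset's stored `shape_length`/`shape_area` (computed from the original multipolygons) remain the authoritative measurements.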
5 changes: 3 additions & 2 deletions backend/etl/soft_story_properties_data_handler.py
@@ -5,6 +5,7 @@
 from sqlalchemy.ext.declarative import DeclarativeMeta
 from dotenv import load_dotenv
 import os
+from typing import Dict, Tuple
 from etl.mapbox_geojson_manager import MapboxGeojsonManager
 from backend.api.models.base import ModelType

@@ -33,7 +34,7 @@ def fill_in_missing_mapbox_points(
             return parsed_data

         mapbox_coordinates_map: Dict[str, Tuple[float, float]] = (
-            self.mapbox_geojson_manager.batch_geocode_addresses(addresses)
+            self.mapbox_geojson_manager.batch_geocode_addresses(addresses)  # type: ignore
         )

         for data_point in parsed_data:

@@ -124,6 +125,6 @@ def parse_data(self, sf_data: dict) -> list[dict]:
     try:
         soft_story_properties = handler.fetch_data()
         soft_story_property_objects = handler.parse_data(soft_story_properties)
-        handler.bulk_insert_data_autoincremented(soft_story_property_objects)
+        handler.bulk_insert_data(soft_story_property_objects, "property_address")
     except HTTPException as e:
         print(f"Failed after retries: {e}")