Skip to content

Commit

Permalink
Load data into Vercel using GitHub Actions (#161)
Browse files Browse the repository at this point in the history
added an etl github action workflow
  • Loading branch information
agennadi authored Jan 21, 2025
1 parent 242215d commit ec59e65
Show file tree
Hide file tree
Showing 10 changed files with 65 additions and 19 deletions.
1 change: 1 addition & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ POSTGRES_DB=qsdatabase
POSTGIS_VERSION=3.5.0

# Backend Environment Variables
NEON_URL=dummy-neon-url
FRONTEND_HOST=http://localhost:3000
DATABASE_URL=postgresql://postgres:password@db:5432/qsdatabase # Connection string for the PostgreSQL database
LOCALHOST_DATABASE_URL=postgresql://postgres:password@localhost:5432/qsdatabase #Connection string for the PostgreSQL database when running locally
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/env_vars.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ jobs:
envkey_POSTGRES_DB: ${{ secrets.POSTGRES_DB }}
envkey_POSTGIS_VERSION: ${{ secrets.POSTGIS_VERSION }}

envkey_NEON_URL: ${{ secrets.NEON_URL }}
envkey_FRONTEND_HOST: ${{ secrets.FRONTEND_HOST }}
envkey_DATABASE_URL: ${{ secrets.DATABASE_URL }}
envkey_LOCALHOST_DATABASE_URL: ${{ secrets.LOCALHOST_DATABASE_URL }}
Expand Down
45 changes: 45 additions & 0 deletions .github/workflows/etl_to_neon.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
name: ETL to neon

# Workflow triggers
on:
schedule:
- cron: "0 2 * * 0" # Runs at 2am UTC every Sunday
workflow_dispatch: # Allows manual triggering of the workflow

jobs:
neon-etl:
runs-on: ubuntu-latest

steps:
- name: Checkout repository
uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: "3.12"

- name: Install dependencies
run: |
pip install -r requirements.txt
- name: Get Run ID of Most Recent Successful Run
id: get_run_id
run: |
response=$(curl -s -H "Authorization: token ${{ secrets.GH_PAT }}" \
"https://api.github.com/repos/sfbrigade/datasci-earthquake/actions/workflows/env_vars.yml/runs?status=completed&conclusion=success")
run_id=$(echo $response | jq '.workflow_runs[0].id')
echo "Run ID: $run_id"
echo "run_id=$run_id" >> $GITHUB_ENV
- name: Download .env Artifact
uses: actions/download-artifact@v4
with:
name: env-file
github-token: ${{ secrets.GH_PAT }}
repository: sfbrigade/datasci-earthquake
run-id: ${{ env.run_id }}

- name: ETL data to Neon DB
run: |
python -m backend.etl.tsunami_data_handler
python -m backend.etl.soft_story_properties_data_handler
python -m backend.etl.liquefaction_data_handler
1 change: 1 addition & 0 deletions backend/api/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ class Settings(BaseSettings):
postgres_db: str
postgis_version: str
frontend_host: str
neon_url: str
database_url: str
localhost_database_url: str
database_url_sqlalchemy: str
Expand Down
4 changes: 1 addition & 3 deletions backend/api/models/liquefaction_zones.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,7 @@ class LiquefactionZone(Base):

__tablename__ = "liquefaction_zones"

identifier: Mapped[int] = mapped_column(
Integer, primary_key=True, autoincrement=True
)
identifier: Mapped[int] = mapped_column(String, primary_key=True)
geometry: Mapped[Geometry] = mapped_column(Geometry("MULTIPOLYGON", srid=4326))
liq: Mapped[str] = mapped_column(String)
shape_length: Mapped[float] = mapped_column(Float)
Expand Down
4 changes: 3 additions & 1 deletion backend/api/models/soft_story_properties.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,9 @@ class SoftStoryProperty(Base):
block: Mapped[str] = mapped_column(String(_STRING_LENGTH), nullable=True)
lot: Mapped[str] = mapped_column(String(_STRING_LENGTH), nullable=True)
parcel_number: Mapped[str] = mapped_column(String(_STRING_LENGTH), nullable=True)
property_address: Mapped[str] = mapped_column(String(_STRING_LENGTH), nullable=True)
property_address: Mapped[str] = mapped_column(
String(_STRING_LENGTH), nullable=True, unique=True
)
address: Mapped[str] = mapped_column(String(_STRING_LENGTH), nullable=False)
tier: Mapped[int] = mapped_column(Integer, nullable=True)
status: Mapped[str] = mapped_column(String(_STRING_LENGTH), nullable=True)
Expand Down
2 changes: 1 addition & 1 deletion backend/database/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from backend.api.config import settings

# Set up the database engine using settings
engine = create_engine(settings.database_url_sqlalchemy, echo=True)
engine = create_engine(settings.neon_url, echo=True)

# Create a session factory
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
Expand Down
12 changes: 0 additions & 12 deletions backend/etl/data_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,15 +85,3 @@ def bulk_insert_data(self, data_dicts: list[dict], id_field: str):
stmt = stmt.on_conflict_do_nothing(index_elements=[id_field])
db.execute(stmt)
db.commit()

def bulk_insert_data_autoincremented(self, data_dicts: list[dict]):
"""
Inserts the list of dictionaries with SQLAlchemy-generated IDs into the database table as
SQLAlchemy objects
"""
# TODO: Implement logic to upsert only changed data
with next(get_db()) as db:
stmt = pg_insert(self.table).values(data_dicts)
stmt = stmt.on_conflict_do_nothing()
db.execute(stmt)
db.commit()
9 changes: 9 additions & 0 deletions backend/etl/liquefaction_data_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from backend.api.models.liquefaction_zones import LiquefactionZone
from shapely.geometry import shape
from geoalchemy2.shape import from_shape
from geoalchemy2.functions import ST_Simplify


_LIQUEFACTION_URL = "https://data.sfgov.org/resource/i4t7-35u3.geojson"
Expand All @@ -22,15 +23,23 @@ def parse_data(self, data: dict) -> list[dict]:
Each dictionary represents a row for the database table.
Geometry data is converted into a GeoAlchemy-compatible
MultiPolygon with srid 4326.
To avoid API timeout errors, this method reduces the complexity of the multipolygons by collapsing points that are closer than 0.0001 degrees into a single point.
Note that the dataset contains the length and area of the original multipolygons.
"""
tolerance = 0.0001
features = data["features"]
parsed_data = []
for feature in features:
properties = feature.get("properties", {})
geometry = feature.get("geometry", {})
multipolygon = shape(geometry)
geoalchemy_multipolygon = from_shape(multipolygon, srid=4326)
simplified_geoalchemy_multipolygon = ST_Simplify(
geoalchemy_multipolygon, tolerance
)
liquefaction_zone = {
"identifier": f'{properties.get("shape_leng")}-{properties.get("shape_area")}-{properties.get("liq")}',
"liq": properties.get("liq"),
"geometry": geoalchemy_multipolygon,
"shape_length": properties.get("shape_leng"),
Expand Down
5 changes: 3 additions & 2 deletions backend/etl/soft_story_properties_data_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from sqlalchemy.ext.declarative import DeclarativeMeta
from dotenv import load_dotenv
import os
from typing import Dict, Tuple
from etl.mapbox_geojson_manager import MapboxGeojsonManager
from backend.api.models.base import ModelType

Expand Down Expand Up @@ -33,7 +34,7 @@ def fill_in_missing_mapbox_points(
return parsed_data

mapbox_coordinates_map: Dict[str, Tuple[float, float]] = (
self.mapbox_geojson_manager.batch_geocode_addresses(addresses)
self.mapbox_geojson_manager.batch_geocode_addresses(addresses) # type: ignore
)

for data_point in parsed_data:
Expand Down Expand Up @@ -124,6 +125,6 @@ def parse_data(self, sf_data: dict) -> list[dict]:
try:
soft_story_properties = handler.fetch_data()
soft_story_property_objects = handler.parse_data(soft_story_properties)
handler.bulk_insert_data_autoincremented(soft_story_property_objects)
handler.bulk_insert_data(soft_story_property_objects, "property_address")
except HTTPException as e:
print(f"Failed after retries: {e}")

0 comments on commit ec59e65

Please sign in to comment.