Skip to content

Commit

Permalink
feat(airflow): Add Apache Airflow to the pipeline (#74)
Browse files Browse the repository at this point in the history
* feat(airflow): Add Apache Airflow to the pipeline
  • Loading branch information
xmnlab authored Aug 24, 2022
1 parent ea4e7b5 commit 656d497
Show file tree
Hide file tree
Showing 28 changed files with 1,990 additions and 305 deletions.
25 changes: 23 additions & 2 deletions .env.tpl
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
_AIRFLOW_DB_UPGRADE="true"
_AIRFLOW_WWW_USER_CREATE="true"
_AIRFLOW_WWW_USER_USERNAME=${_AIRFLOW_WWW_USER_USERNAME}
_AIRFLOW_WWW_USER_PASSWORD=${_AIRFLOW_WWW_USER_PASSWORD}
_AIRFLOW_WWW_USER_EMAIL=${_AIRFLOW_WWW_USER_EMAIL}
_AIRFLOW_WWW_USER_FIRST_NAME=${_AIRFLOW_WWW_USER_FIRST_NAME}
_AIRFLOW_WWW_USER_LAST_NAME=${_AIRFLOW_WWW_USER_LAST_NAME}
_PIP_ADDITIONAL_REQUIREMENTS=${_PIP_ADDITIONAL_REQUIREMENTS}
AIRFLOW__API__AUTH_BACKENDS="airflow.api.auth.backend.basic_auth"
AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION=${AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION}
AIRFLOW__CORE__FERNET_KEY=${AIRFLOW__CORE__FERNET_KEY}
AIRFLOW__DATABASE__SQL_ALCHEMY_CONN=${AIRFLOW__DATABASE__SQL_ALCHEMY_CONN}
AIRFLOW__WEBSERVER__SECRET_KEY=${AIRFLOW__WEBSERVER__SECRET_KEY}
AIRFLOW_UID=${AIRFLOW_UID}
AIRFLOW_PORT=${AIRFLOW_PORT}
AIRFLOW_FILES_PATH_DIR_HOST=${AIRFLOW_FILES_PATH_DIR_HOST}
ANSIBLE_VAULT_KEY=${ANSIBLE_VAULT_KEY}
ENV=${ENV}
FLOWER_PORT=${FLOWER_PORT}
Expand All @@ -8,8 +24,11 @@ POSTGRES_PORT=${POSTGRES_PORT}
POSTGRES_USER=${POSTGRES_USER}
POSTGRES_PASSWORD=${POSTGRES_PASSWORD}
POSTGRES_DB=${POSTGRES_DB}
POSTGRES_DB_PRIVATE=${POSTGRES_DB_PRIVATE}
POSTGRES_DB_SANDBOX=${POSTGRES_DB_SANDBOX}
POSTGRES_EPIGRAPH_USER=${POSTGRES_EPIGRAPH_USER}
POSTGRES_EPIGRAPH_PASSWORD=${POSTGRES_EPIGRAPH_PASSWORD}
POSTGRES_EPIGRAPH_DB=${POSTGRES_EPIGRAPH_DB}
# Each value must reference the identically named POSTGRES_EPIGRAPH_* environment variable.
POSTGRES_EPIGRAPH_DB_PRIVATE=${POSTGRES_EPIGRAPH_DB_PRIVATE}
POSTGRES_EPIGRAPH_DB_SANDBOX=${POSTGRES_EPIGRAPH_DB_SANDBOX}
RECAPTCHA_PUBLIC_KEY=${RECAPTCHA_PUBLIC_KEY}
RECAPTCHA_PRIVATE_KEY=${RECAPTCHA_PRIVATE_KEY}
REDIS_HOST=${REDIS_HOST}
Expand All @@ -27,3 +46,5 @@ SUPERSET_MAIL_PORT=${SUPERSET_MAIL_PORT:-587}
SUPERSET_MAIL_USERNAME=${SUPERSET_MAIL_USERNAME}
SUPERSET_MAIL_PASSWORD=${SUPERSET_MAIL_PASSWORD}
SUPERSET_MAIL_DEFAULT_SENDER=${SUPERSET_MAIL_DEFAULT_SENDER}
HOST_UID=${HOST_UID}
HOST_GID=${HOST_GID}
47 changes: 33 additions & 14 deletions .github/workflows/main.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,34 @@ on:
branches: [ main ]

env:
AIRFLOW_FILES_PATH_DIR_HOST: /tmp/airflow
AIRFLOW_HOME: /opt/airflow
AIRFLOW_PORT: 8099
AIRFLOW__API__AUTH_BACKENDS: airflow.api.auth.backend.basic_auth
AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: true
AIRFLOW__CORE__FERNET_KEY: rhZaG4rip4DrA4Z93rI9uqWVK2/8Tk+zSixmEe3ZZo8=
AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://dev_airflow_user:airflow_password@postgres:25432/dev_airflow
AIRFLOW__WEBSERVER__SECRET_KEY: 79197e3bef40d1d2a6cf07cb1f4c9f54e82a3bcfc6b92ed06d081856eee2
_AIRFLOW_DB_UPGRADE: true
_AIRFLOW_WWW_USER_CREATE: true
_AIRFLOW_WWW_USER_EMAIL: [email protected]
_AIRFLOW_WWW_USER_FIRST_NAME: Super
_AIRFLOW_WWW_USER_LAST_NAME: Admin
_AIRFLOW_WWW_USER_PASSWORD: airflow
_AIRFLOW_WWW_USER_USERNAME: airflow
ENV: dev
FLOWER_PORT: 28888
FLOWER_PASSWORD: flowerpass
POSTGRES_HOST: postgres
POSTGRES_PORT: 25432
POSTGRES_USER: dev_epigraph
POSTGRES_PASSWORD: dev_epigraph
POSTGRES_DB: dev_epigraphhub
POSTGRES_DB_PRIVATE: dev_privatehub
POSTGRES_DB_SANDBOX: dev_sandbox
POSTGRES_USER: postgres
POSTGRES_PASSWORD: postgres
POSTGRES_DB: postgres
POSTGRES_EPIGRAPH_USER: dev_epigraph
POSTGRES_EPIGRAPH_PASSWORD: dev_epigraph
POSTGRES_EPIGRAPH_DB: dev_epigraphhub
POSTGRES_EPIGRAPH_DB_PRIVATE: dev_privatehub
POSTGRES_EPIGRAPH_DB_SANDBOX: dev_sandbox
REDIS_HOST: redis
REDIS_PORT: 6379
REDIS_PASSWORD: redispass
Expand All @@ -36,7 +54,7 @@ jobs:
main:

runs-on: ubuntu-latest
timeout-minutes: 35
timeout-minutes: 55
defaults:
run:
shell: bash -l {0}
Expand All @@ -48,10 +66,12 @@ jobs:
- uses: actions/checkout@v2

- name: setup
run: make prepare-host-db
run: make prepare-host

- name: Create environment variables file
run: |
export HOST_UID=$(id -u)
export HOST_GID=$(id -g)
envsubst < .env.tpl > .env
- uses: conda-incubator/setup-miniconda@v2
Expand All @@ -64,27 +84,26 @@ jobs:
use-mamba: true
miniforge-variant: Mambaforge

- name: build and start docker containers
- name: build docker containers
run: make docker-build-services

- name: start services
run: |
make docker-build
make docker-start SERVICES=superset
make docker-start SERVICES="superset airflow"
- name: wait for the services are properly working
run: |
docker ps
make docker-wait-all
- name: prepare database
run: make docker-dev-prepare-db

- name: test cron scripts
run: |
make docker-run-cron SERVICE=superset
- name: Generate logs
if: ${{ failure() }}
run: |
make docker-logs SERVICES= > /tmp/docker-services.log
make docker-logs ARGS="--tail 1000" SERVICES="" > /tmp/docker-services.log
- name: Archive log artifacts
uses: actions/upload-artifact@v3
Expand Down
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -152,3 +152,6 @@ cython_debug/
Data_Collection/CRON_scripts/.DS_Store
Data_Collection/CRON_scripts/.DS_Store
Documentation/.Rhistory

# airflow
docker/airflow/files/logs/*
8 changes: 4 additions & 4 deletions Data_Collection/CRON_scripts/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@

DB_HOST = os.environ.get("POSTGRES_HOST")
DB_PORT = os.environ.get("POSTGRES_PORT")
DB_USER = os.environ.get("POSTGRES_USER")
DB_PASSWORD = os.environ.get("POSTGRES_PASSWORD")
DB_NAME = os.environ.get("POSTGRES_DB")
DB_NAME_PRIVATE = os.environ.get("POSTGRES_DB_PRIVATE")
DB_USER = os.environ.get("POSTGRES_EPIGRAPH_USER")
DB_PASSWORD = os.environ.get("POSTGRES_EPIGRAPH_PASSWORD")
DB_NAME = os.environ.get("POSTGRES_EPIGRAPH_DB")
DB_NAME_PRIVATE = os.environ.get("POSTGRES_EPIGRAPH_DB_PRIVATE")

DB_URI = (
f"postgresql://{DB_USER}:{DB_PASSWORD}"
Expand Down
52 changes: 41 additions & 11 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
SERVICES:=superset
SERVICES:=superset airflow
SERVICE:=superset
# options: dev, prod
ENV:=$(shell scripts/get-env-name.sh)
CONSOLE:=bash
CRON:=
ARGS:=
TIMEOUT:=90


DOCKER=docker-compose \
--env-file .env \
Expand All @@ -13,20 +16,40 @@ DOCKER=docker-compose \

# HOST

.PHONY: prepare-host-db
prepare-host-db:
bash scripts/prepare-host-db.sh
.PHONY: prepare-host
prepare-host:
bash scripts/prepare-host.sh

# DOCKER

# NOTE: in GNU Make `.ONESHELL:` is a file-wide switch. Once it appears anywhere
# in the Makefile, every recipe runs in a single shell invocation, not only the
# rule that follows it. That is why recipes start with `set -e`: without it only
# the exit status of the last command in the recipe would be reported.
.ONESHELL:
# Pull the images of the services listed in SERVICES.
.PHONY:docker-pull
docker-pull:
set -e
$(DOCKER) pull ${SERVICES}

.ONESHELL:
.PHONY:docker-build
docker-build:
set -e
$(DOCKER) build ${SERVICES}

.ONESHELL:
.PHONY:docker-build-services
docker-build-services: docker-pull
set -e
$(MAKE) docker-build SERVICES="superset"
$(DOCKER) build ${SERVICES}
$(DOCKER) pull ${SERVICES}

.PHONY:docker-start
docker-start: prepare-host-db
docker-start: prepare-host
set -e
if [ "${ENV}" = "dev" ]; then \
$(DOCKER) up -d postgres; \
./docker/healthcheck.sh postgres; \
fi
$(DOCKER) up --remove-orphans -d ${SERVICES}
$(MAKE) docker-wait SERVICE=airflow

.PHONY:docker-stop
docker-stop:
Expand All @@ -45,20 +68,21 @@ docker-logs-follow:

.PHONY: docker-wait
docker-wait:
ENV=${ENV} timeout 90 ./docker/healthcheck.sh ${SERVICE}
ENV=${ENV} timeout ${TIMEOUT} ./docker/healthcheck.sh ${SERVICE}

# Run the `docker-wait` target once per service, in this order: redis, flower,
# superset, airflow. The postgres check is currently disabled (commented out).
# A failing sub-make stops the sequence at that service.
.PHONY: docker-wait-all
docker-wait-all:
# $(MAKE) docker-wait ENV=${ENV} SERVICE="postgres"
$(MAKE) docker-wait ENV=${ENV} SERVICE="redis"
$(MAKE) docker-wait ENV=${ENV} SERVICE="flower"
$(MAKE) docker-wait ENV=${ENV} SERVICE="superset"
$(MAKE) docker-wait ENV=${ENV} SERVICE="airflow"

.PHONY:docker-dev-prepare-db
docker-dev-prepare-db:
# used for development
$(DOCKER) exec -T superset \
bash /opt/EpiGraphHub/docker/postgresql/prepare-db.sh
bash /opt/EpiGraphHub/docker/postgresql/scripts/dev/prepare-db.sh

.PHONY:docker-run-cron
docker-run-cron:
Expand Down Expand Up @@ -86,9 +110,15 @@ docker-get-ips:
docker-console:
$(DOCKER) exec ${SERVICE} ${CONSOLE}

.PHONY:docker-run-bash
docker-run-bash:
$(DOCKER) run --rm ${SERVICE} bash
.PHONY:docker-run-console
docker-run-console:
$(DOCKER) run --rm ${SERVICE} ${CONSOLE}


# Tear down the whole compose project.
# WARNING: destructive. `--volumes` also deletes the volumes declared in the
# compose files, so any data kept in them is lost. `--remove-orphans` removes
# containers of services that are no longer defined in the compose files.
.PHONY:docker-down
docker-down:
$(DOCKER) down --volumes --remove-orphans


# conda

Expand Down
14 changes: 11 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@
## About The Project
![fig1](https://user-images.githubusercontent.com/140123/165057109-f09d6e8d-6ca2-405f-b912-7e068f20e39a.png)
EpigraphHub is an entirely open-source data integration platform focused on epidemiological analyses.



### Built With
Expand Down Expand Up @@ -166,9 +166,9 @@ $ conda activate epigraphhub
```

### Docker

![fig2](https://user-images.githubusercontent.com/140123/165057193-c5a2b2a2-8f30-426d-9bac-8f559c01265d.png)


The project provides three **docker-compose** files, where one is the base
definition (`docker/compose-base.yaml`) and the others are one for
Expand Down Expand Up @@ -218,6 +218,14 @@ $ ./docker/healthcheck.sh epigraphhub
$ make docker-prepare-db
```

## AIRFLOW

Create a **Fernet key** and use it as the value of `AIRFLOW__CORE__FERNET_KEY`:

```bash
openssl rand -base64 32
```

## Deployment

The deployment is executed by **Ansible** and triggered by **GitHub Actions**.
Expand Down
47 changes: 47 additions & 0 deletions conda/airflow.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Conda environment specification: Apache Airflow 2.3.3 on Python 3.8 together
# with the Python and R packages listed below. Packages are resolved from
# conda-forge and the `r` channel (`nodefaults` excludes the default channels).
# Additional pip requirements are read from pip.txt.
name: epigraphhub
channels:
  - nodefaults
  - conda-forge
  - r
dependencies:
  - airflow 2.3.3
  - fiona
  - geopandas
  - gsheetsdb
  - inquirer
  - lightgbm
  - loguru
  - myst-parser
  - openpyxl
  - pandas
  - pip
  - psycopg2
  - python 3.8.*
  - python-dotenv
  - setuptools
  - sqlalchemy
  # waiting for https://github.com/stanfordmlgroup/ngboost/issues/283
  # - ngboost >=0.3.12
  # dev (pip is already listed above, so it is not repeated here)
  - docker-compose
  # R
  - r-base 3.6.*
  - r-tidyverse
  - r-shiny
  - r-stringr
  - r-readxl
  - r-dplyr
  - r-dbi
  - r-vroom
  - r-odbc
  - r-rsocrata
  - r-glue
  - r-janitor
  - r-here
  - r-desctools
  - r-lubridate
  - r-isocodes
  # - r-dm
  # Requirements installed with pip after the conda packages are resolved.
  - pip:
    - -r pip.txt
Loading

0 comments on commit 656d497

Please sign in to comment.