Skip to content

Commit

Permalink
ci: Working on schedulers
Browse files Browse the repository at this point in the history
  • Loading branch information
Dr0p42 committed Jul 31, 2024
1 parent aecd6d9 commit 3e5f91e
Show file tree
Hide file tree
Showing 8 changed files with 311 additions and 20 deletions.
96 changes: 96 additions & 0 deletions .github/scripts/generate_schedulers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import yaml, os
import pydash as _

# Base GitHub Actions workflow, kept as YAML text and parsed once per scheduler
# by generate_schedulers(). The `on: {}` entry is only a placeholder: the
# generator discards it and installs the scheduler's own cron and
# workflow_dispatch triggers. The `#`-prefixed steps at the end are YAML
# comments and therefore never reach the parsed workflow.
template_str = """
name: CI/CD Workflow
on: {}
jobs:
scheduler:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Log in to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Pull Docker image
run: docker pull ghcr.io/jupyter-naas/abi/abi:latest
# - name: Run Papermill
# run: |
# docker run --name abi-execution -i --platform linux/amd64 ghcr.io/jupyter-naas/abi/abi:latest ls
# mkdir output
# docker cp abi-execution:/app/__pipeline__.ipynb ./output/__pipeline__.ipynb
# - name: Upload output artifacts
# uses: actions/upload-artifact@v4
# with:
# name: output-files
# path: ./output
"""

def generate_schedulers(config: dict, template: str):
    """Write one GitHub Actions workflow file per enabled scheduler.

    Args:
        config: Parsed ``config.yml``. Its ``schedulers`` list is iterated;
            every entry needs ``name`` and ``cron``. Entries whose ``enabled``
            flag is missing or ``False`` are skipped.
        template: YAML text of the base workflow. It must define
            ``jobs.scheduler.steps``; any ``on`` entry it carries is replaced.

    Each workflow is written to ``.github/workflows/scheduler__<name>.yaml``
    (relative to the current working directory), where ``<name>`` is the
    scheduler name lowercased with spaces turned into underscores.
    """
    for scheduler in config["schedulers"]:
        # Skip disabled schedulers
        if scheduler.get("enabled", False) is False:
            continue

        # Parse the template that was passed in (not the module global), and do
        # it once per scheduler so appended steps never leak between workflows.
        cicd = yaml.safe_load(template)

        # PyYAML loads the bare `on` key as the boolean True. Discard that
        # placeholder if present; a template without it must not crash.
        cicd.pop(True, None)

        cicd["name"] = f"Scheduler - {scheduler['name']}"
        cicd["on"] = {"schedule": [{"cron": scheduler["cron"]}], "workflow_dispatch": {}}

        run_script = f"""
# Generate unique id
export SCHEDULER_ID=$(python -c "import uuid; print(uuid.uuid4())")
# Execute the Scheduler script
docker run --name $SCHEDULER_ID -i --platform linux/amd64 ghcr.io/jupyter-naas/abi/abi:latest python .github/scripts/run_scheduler.py "{scheduler['name']}"
# Create the output directory that will be used to store the output files and save them as artifacts.
mkdir -p outputs/
# Copy the output files from the container to the host.
docker cp $SCHEDULER_ID:/app/outputs ./outputs/
"""

        steps = cicd["jobs"]["scheduler"]["steps"]

        # Step that runs the scheduler inside the container.
        steps.append({"name": scheduler["name"], "run": run_script})

        # Step that publishes whatever the scheduler wrote to ./outputs.
        steps.append({
            'name': 'Upload output artifacts',
            'uses': 'actions/upload-artifact@v4',
            'with': {
                'name': 'output-files',
                'path': './outputs'
            }
        })

        # Write to file.
        # Make sure scheduler name is a valid filename.
        scheduler_name = scheduler["name"].replace(" ", "_").lower()
        workflow_path = os.path.join('.github/workflows', f'scheduler__{scheduler_name}.yaml')

        # The context manager guarantees the file is flushed and closed.
        with open(workflow_path, "w") as workflow_file:
            yaml.dump(cicd, workflow_file)

if __name__ == "__main__":
    # Script entry point: read config.yml from the current working directory
    # and generate the workflows from the module-level template.
    with open("config.yml", "r") as config_stream:
        config = yaml.safe_load(config_stream)

    generate_schedulers(config, template_str)
92 changes: 92 additions & 0 deletions .github/scripts/run_scheduler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
#!/usr/bin/env python

# This script is used to run a scheduler defined in config.yml

import os
import sys

import papermill
import yaml
import re

class SchedulerNotFoundError(Exception):
    """Raised when config.yml has no scheduler with the requested name."""


class UnknownStepTypeError(Exception):
    """Raised when an enabled step declares a type the runner cannot execute."""

# Backing up environment variables.
environment_vars_backup: dict[str, str] = os.environ.copy()


def sanitize_string_to_filename(filename):
    """Turn an arbitrary label into a lowercase, filesystem-friendly name.

    Processing order:
      1. remove backslash, slash and the characters ``* ? : " < > |``;
      2. trim leading/trailing whitespace;
      3. replace the remaining spaces with underscores;
      4. cap the result at 255 characters;
      5. lowercase it.

    Args:
        filename: The text to sanitize (for example a scheduler or step name).

    Returns:
        The sanitized string; it may be empty if nothing valid remains.
    """
    max_filename_length = 255

    # Remove invalid characters
    cleaned = re.sub(r'[\\/*?:"<>|]', '', filename)

    # Trim BEFORE converting spaces. Doing it afterwards (as the code used to)
    # turned leading/trailing spaces into underscores that strip() then kept.
    cleaned = cleaned.strip().replace(' ', '_')

    # Ensure filename doesn't exceed the max length
    return cleaned[:max_filename_length].lower()

def get_scheduler(scheduler_name: str):
    """Return the entry of ``schedulers`` in config.yml named *scheduler_name*.

    config.yml is read from the current working directory on every call.

    Raises:
        SchedulerNotFoundError: if no scheduler entry carries that name.
    """
    with open("config.yml", "r") as config_stream:
        schedulers = yaml.safe_load(config_stream)["schedulers"]

    # First entry with a matching name wins, as with a plain loop-and-return.
    found = next(
        (entry for entry in schedulers if entry["name"] == scheduler_name),
        None,
    )
    if found is None:
        raise SchedulerNotFoundError(
            f"Scheduler '{scheduler_name}' not found in config.yml"
        )
    return found


def reset_environment_vars():
    """Restore ``os.environ`` to the snapshot kept in ``environment_vars_backup``."""
    env = os.environ
    # Empty the mapping first so variables added since the snapshot disappear,
    # then put the snapshot's values back.
    env.clear()
    env.update(environment_vars_backup)


def run_notebook_step(scheduler_name: str, step: dict):
    """Execute one notebook step with papermill and keep the executed copy.

    Args:
        scheduler_name: Name of the owning scheduler; its sanitized form is
            part of the output path.
        step: Step mapping from config.yml. ``entrypoint`` (path of the
            notebook) and ``name`` are required; ``environment_variables``
            and ``inputs`` are optional.

    The executed notebook is written to
    ``outputs/scheduler_executions/<scheduler>/<step>/<entrypoint dir>/<notebook>``.
    """
    # Start from the saved environment so variables from a previous step
    # are not visible to this one.
    reset_environment_vars()

    # `or {}` also covers an `environment_variables:` key left empty in YAML,
    # which is parsed as None.
    for key, value in (step.get("environment_variables") or {}).items():
        # YAML scalars may be int/bool/float, but os.environ only accepts str.
        os.environ[str(key)] = str(value)

    # Split "dir/sub/notebook.ipynb" into its directory part and file name;
    # an entrypoint without "/" yields an empty directory part.
    entrypoint_path, _, notebook_name = step["entrypoint"].rpartition('/')

    output_path = os.path.join(f"outputs/scheduler_executions/{sanitize_string_to_filename(scheduler_name)}/{sanitize_string_to_filename(step['name'])}", entrypoint_path)
    os.makedirs(output_path, exist_ok=True)

    papermill.execute_notebook(
        input_path=step["entrypoint"],
        output_path=os.path.join(output_path, notebook_name),
        parameters=step.get("inputs", {}),
    )

def run_scheduler(scheduler_name: str):
    """Run every enabled step of the named scheduler, in configuration order.

    Raises:
        UnknownStepTypeError: for an enabled step whose ``type`` is not
            ``"notebook"``.
    """
    for step in get_scheduler(scheduler_name)["steps"]:
        # A step is skipped only when `enabled` is literally False or absent.
        if step.get("enabled", False) is False:
            continue

        step_type = step.get("type")
        if step_type != "notebook":
            raise UnknownStepTypeError(f"Unknown step type: {step_type}")

        run_notebook_step(scheduler_name, step)


if __name__ == "__main__":
    # Report a missing scheduler name with a usage message and a non-zero exit
    # status instead of an IndexError traceback.
    if len(sys.argv) < 2:
        print("Usage: python run_scheduler.py <scheduler_name>", file=sys.stderr)
        sys.exit(1)

    scheduler_name = sys.argv[1]
    run_scheduler(scheduler_name)
31 changes: 31 additions & 0 deletions .github/scripts/validate_jsonschema_yaml.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/usr/bin/env python

import sys
import json
import yaml
from jsonschema import validate, ValidationError

# Command-line contract: exactly two arguments, the JSON schema file followed
# by the YAML data file. Exit status is 0 when the data validates and 1 on a
# usage error or a validation failure.
if len(sys.argv) != 3:
    print("Usage: python validate_jsonschema_yaml.py <schemafile> <datafile>")
    sys.exit(1)

schema_file, data_file = sys.argv[1:3]

# Load JSON schema
with open(schema_file) as schema_stream:
    schema = json.load(schema_stream)

# Load YAML data
with open(data_file) as data_stream:
    data = yaml.safe_load(data_stream)

# Validate
try:
    validate(instance=data, schema=schema)
except ValidationError as validation_error:
    print(validation_error)
    sys.exit(1)
else:
    print("YAML data is valid")
    sys.exit(0)
41 changes: 33 additions & 8 deletions .github/workflows/scheduler__main.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,40 @@ jobs:
registry: ghcr.io
username: ${{ github.actor }}
- name: Pull Docker image
run: docker pull ghcr.io/${{ github.repository }}/abi:latest
- name: Execute test/ci.ipynb
run: 'docker run --name abi-execution -i --platform linux/amd64 ghcr.io/${{
github.repository }}/abi:latest papermill tests/ci.ipynb outputs/ci.ipynb '
- name: Run hubspot
run: 'docker run --name abi-execution -i --platform linux/amd64 ghcr.io/${{
github.repository }}/abi:latest papermill tests/ci.ipynb outputs/ci.ipynb '
run: docker pull ghcr.io/jupyter-naas/abi/abi:latest
- name: main
run: '
# Generate unique id
export SCHEDULER_ID=$(python -c "import uuid; print(uuid.uuid4())")
# Execute the Scheduler script
docker run --name $SCHEDULER_ID -i --platform linux/amd64 ghcr.io/jupyter-naas/abi/abi:latest
python .github/scripts/run_scheduler.py "main"
# Create the output directory that will be used to store the output files
and save them as artifacts.
mkdir -p outputs/
# Copy the output files from the container to the host.
docker cp $SCHEDULER_ID:/app/outputs ./outputs/
'
- name: Upload output artifacts
uses: actions/upload-artifact@v4
with:
name: output-files
path: ./outputs
name: Scheduler - main
'on':
schedule:
- cron: 0 10 * * *
- cron: '*/5 * * * *'
workflow_dispatch: {}
5 changes: 4 additions & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
{
"python.defaultInterpreterPath": ".abi-config/bin/python"
"python.defaultInterpreterPath": ".abi-config/bin/python",
"conventionalCommits.scopes": [
"CI/CD"
]
}
16 changes: 15 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ Usage:
make usage: Show this message
make build: Build the Docker image to package ABI.

make ci-generate-schedulers: Generate the scheduler files for the CI/CD
make ci-run-scheduler scheduler=<scheduler>: Run a specific scheduler


endef
export usage_str
Expand Down Expand Up @@ -120,4 +123,15 @@ windows-install-conda:
build: build.linux.x86_64

build.linux.x86_64:
docker build . -t abi -f Dockerfile.linux.x86_64 --platform linux/amd64
docker build . -t abi -f Dockerfile.linux.x86_64 --platform linux/amd64

# CI/CD

# Generate the scheduler workflow files by running generate_schedulers.py.
ci-generate-schedulers:
	@ conda run -p .abi-conda python .github/scripts/generate_schedulers.py

# Run one scheduler: make ci-run-scheduler scheduler=<scheduler>
# The name is quoted so that a scheduler whose name contains spaces reaches
# the script as a single argument.
ci-run-scheduler:
	@ conda run -p .abi-conda python .github/scripts/run_scheduler.py "$(scheduler)"

# Validations

# Check config.yml against config.schema.json.
validate-config:
	@ conda run -p .abi-conda python .github/scripts/validate_jsonschema_yaml.py config.schema.json config.yml
25 changes: 16 additions & 9 deletions config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,24 @@ config:

schedulers:
- name: main
enable: true
cron: 0 10 * * *
enabled: true
cron: '*/5 * * * *'
steps:
- type: Notebook
enable: true
entrypoint: __pipeline__.ipynb
- type: notebook
name: Run pipeline
enabled: true
entrypoint: tests/ci.ipynb
environment_variables:
HELLO: WORLD
CUSTOM_ENV_VAR: Hello World!
inputs:
notifications: false
steps: ["opendata", "content", "growth", "sales", "ops", "finance"]
- type: Notebook
enable: false
entrypoint: ./models/Hubspot.ipynb
message: "Hello scheduler!"
- type: notebook
name: Run Second step
enabled: true
entrypoint: tests/ci.ipynb
inputs:
notifications: false
steps: ["opendata", "content", "growth", "sales", "ops", "finance"]
message: "Hello scheduler!"
25 changes: 24 additions & 1 deletion tests/ci.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,30 @@
"metadata": {},
"outputs": [],
"source": [
"print('It works!')"
"import os"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"message = 'It works!'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(message)\n",
"print(os.environ.get('CUSTOM_ENV_VAR', 'No custom env var found'))"
]
}
],
Expand Down

0 comments on commit 3e5f91e

Please sign in to comment.