From 3e5f91ebd970bb3e8052b9f98b97d9a693c6f169 Mon Sep 17 00:00:00 2001 From: Maxime Jublou Date: Wed, 31 Jul 2024 12:00:07 +0200 Subject: [PATCH] ci: Working on schedulers --- .github/scripts/generate_schedulers.py | 96 +++++++++++++++++++++ .github/scripts/run_scheduler.py | 92 ++++++++++++++++++++ .github/scripts/validate_jsonschema_yaml.py | 31 +++++++ .github/workflows/scheduler__main.yaml | 41 +++++++-- .vscode/settings.json | 5 +- Makefile | 16 +++- config.yml | 25 ++++-- tests/ci.ipynb | 25 +++++- 8 files changed, 311 insertions(+), 20 deletions(-) create mode 100644 .github/scripts/generate_schedulers.py create mode 100644 .github/scripts/run_scheduler.py create mode 100755 .github/scripts/validate_jsonschema_yaml.py diff --git a/.github/scripts/generate_schedulers.py b/.github/scripts/generate_schedulers.py new file mode 100644 index 0000000..65ad802 --- /dev/null +++ b/.github/scripts/generate_schedulers.py @@ -0,0 +1,96 @@ +import yaml, os +import pydash as _ + +template_str = """ +name: CI/CD Workflow + +on: {} + +jobs: + scheduler: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Log in to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Pull Docker image + run: docker pull ghcr.io/jupyter-naas/abi/abi:latest + + # - name: Run Papermill + # run: | + # docker run --name abi-execution -i --platform linux/amd64 ghcr.io/jupyter-naas/abi/abi:latest ls + # mkdir output + # docker cp abi-execution:/app/__pipeline__.ipynb ./output/__pipeline__.ipynb + + # - name: Upload output artifacts + # uses: actions/upload-artifact@v4 + # with: + # name: output-files + # path: ./output +""" + +def generate_schedulers(config : dict, template : str): + for scheduler in config["schedulers"]: + # Skip disabled schedulers + if scheduler.get("enabled", False) is False: + continue + + # Load template + cicd = yaml.safe_load(template_str) + del cicd[True] + print(cicd) + + _.set_(cicd, "name", f"Scheduler - {scheduler['name']}") + + cicd["on"] = {"schedule": [{"cron": scheduler["cron"]}], "workflow_dispatch": {}} + + + new_step = {} + + new_step['name'] = scheduler['name'] + + new_step['run'] = f""" +# Generate unique id +export SCHEDULER_ID=$(python -c "import uuid; print(uuid.uuid4())") + +# Execute the Scheduler script +docker run --name $SCHEDULER_ID -i --platform linux/amd64 ghcr.io/jupyter-naas/abi/abi:latest python .github/scripts/run_scheduler.py "{scheduler['name']}" + +# Create the output directory that will be used to store the output files and save them as artifacts. +mkdir -p outputs/ + +# Copy the output files from the container to the host. +docker cp $SCHEDULER_ID:/app/outputs ./outputs/ + +""" + + # Append the new step to the steps list + cicd["jobs"]["scheduler"]["steps"].append(new_step) + + cicd["jobs"]["scheduler"]["steps"].append({ + 'name': 'Upload output artifacts', + 'uses': 'actions/upload-artifact@v4', + 'with': { + 'name': 'output-files', + 'path': './outputs' + } + }) + + # Write to file. + # Make sure scheduler name is a valid filename. + scheduler_name = scheduler["name"].replace(" ", "_").lower() + yaml.dump(cicd, open(os.path.join('.github/workflows', f'scheduler__{scheduler_name}.yaml'), "w")) + +if __name__ == "__main__": + with open("config.yml", "r") as file: + config = yaml.safe_load(file) + + generate_schedulers(config, template_str) \ No newline at end of file diff --git a/.github/scripts/run_scheduler.py b/.github/scripts/run_scheduler.py new file mode 100644 index 0000000..a82dbe9 --- /dev/null +++ b/.github/scripts/run_scheduler.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python + +# This script is used to run a scheduler defined in config.yml + +import os +import sys + +import papermill +import yaml +import re + +class SchedulerNotFoundError(Exception): + pass + + +class UnknownStepTypeError(Exception): + pass + +# Backing up environment variables. +environment_vars_backup: dict[str, str] = os.environ.copy() + + +def sanitize_string_to_filename(filename): + # Remove invalid characters + filename = re.sub(r'[\\/*?:"<>|]', '', filename) + + # Replace spaces with underscores + filename = filename.replace(' ', '_') + + # Remove leading/trailing whitespace + filename = filename.strip() + + # Ensure filename doesn't exceed the max length + max_filename_length = 255 + if len(filename) > max_filename_length: + filename = filename[:max_filename_length] + + return filename.lower() + +def get_scheduler(scheduler_name: str): + with open("config.yml", "r") as file: + config = yaml.safe_load(file) + + for scheduler in config["schedulers"]: + if scheduler["name"] == scheduler_name: + return scheduler + + raise SchedulerNotFoundError( + f"Scheduler '{scheduler_name}' not found in config.yml" + ) + + +def reset_environment_vars(): + os.environ.clear() + os.environ.update(environment_vars_backup) + + +def run_notebook_step(scheduler_name: str, step: dict): + reset_environment_vars() + + if "environment_variables" in step: + for key, value in step["environment_variables"].items(): + os.environ[key] = value + + entrypoint_path = '/'.join(step["entrypoint"].split('/')[:-1]) + notebook_name = step["entrypoint"].split('/')[-1] + + output_path = os.path.join(f"outputs/scheduler_executions/{sanitize_string_to_filename(scheduler_name)}/{sanitize_string_to_filename(step['name'])}", entrypoint_path) + os.makedirs(output_path, exist_ok=True) + + papermill.execute_notebook( + input_path=step["entrypoint"], + output_path=os.path.join(output_path, notebook_name), + parameters=step.get("inputs", {}), + ) + +def run_scheduler(scheduler_name: str): + scheduler = get_scheduler(scheduler_name) + + for step in scheduler["steps"]: + if step.get("enabled", False) is False: + continue + + if step.get("type") == "notebook": + run_notebook_step(scheduler_name, step) + else: + raise UnknownStepTypeError(f"Unknown step type: {step.get('type')}") + + +if __name__ == "__main__": + scheduler_name = sys.argv[1] + run_scheduler(scheduler_name) diff --git a/.github/scripts/validate_jsonschema_yaml.py b/.github/scripts/validate_jsonschema_yaml.py new file mode 100755 index 0000000..eb3ad8c --- /dev/null +++ b/.github/scripts/validate_jsonschema_yaml.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python + +import sys +import json +import yaml +from jsonschema import validate, ValidationError + +if len(sys.argv) != 3: + print("Usage: python validate_jsonschema_yaml.py ") + sys.exit(1) + +schema_file = sys.argv[1] +data_file = sys.argv[2] + +# Load JSON schema +with open(schema_file) as f: + schema = json.load(f) + +# Load YAML data +with open(data_file) as f: + data = yaml.safe_load(f) + +# Validate +try: + validate(instance=data, schema=schema) +except ValidationError as e: + print(e) + sys.exit(1) + +print("YAML data is valid") +sys.exit(0) diff --git a/.github/workflows/scheduler__main.yaml b/.github/workflows/scheduler__main.yaml index c72e170..aa68962 100644 --- a/.github/workflows/scheduler__main.yaml +++ b/.github/workflows/scheduler__main.yaml @@ -11,15 +11,40 @@ jobs: registry: ghcr.io username: ${{ github.actor }} - name: Pull Docker image - run: docker pull ghcr.io/${{ github.repository }}/abi:latest - - name: Execute test/ci.ipynb - run: 'docker run --name abi-execution -i --platform linux/amd64 ghcr.io/${{ - github.repository }}/abi:latest papermill tests/ci.ipynb outputs/ci.ipynb ' - - name: Run hubspot - run: 'docker run --name abi-execution -i --platform linux/amd64 ghcr.io/${{ - github.repository }}/abi:latest papermill tests/ci.ipynb outputs/ci.ipynb ' + run: docker pull ghcr.io/jupyter-naas/abi/abi:latest + - name: main + run: ' + + # Generate unique id + + export SCHEDULER_ID=$(python -c "import uuid; print(uuid.uuid4())") + + + # Execute the Scheduler script + + docker run --name $SCHEDULER_ID -i --platform linux/amd64 ghcr.io/jupyter-naas/abi/abi:latest + python .github/scripts/run_scheduler.py "main" + + + # Create the output directory that will be used to store the output files + and save them as artifacts. + + mkdir -p outputs/ + + + # Copy the output files from the container to the host. + + docker cp $SCHEDULER_ID:/app/outputs ./outputs/ + + + ' + - name: Upload output artifacts + uses: actions/upload-artifact@v4 + with: + name: output-files + path: ./outputs name: Scheduler - main 'on': schedule: - - cron: 0 10 * * * + - cron: '*/5 * * * *' workflow_dispatch: {} diff --git a/.vscode/settings.json b/.vscode/settings.json index 8471ca3..46e9102 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,3 +1,6 @@ { - "python.defaultInterpreterPath": ".abi-config/bin/python" + "python.defaultInterpreterPath": ".abi-config/bin/python", + "conventionalCommits.scopes": [ + "CI/CD" + ] } diff --git a/Makefile b/Makefile index 61b0fa2..cfc47a4 100644 --- a/Makefile +++ b/Makefile @@ -18,6 +18,9 @@ Usage: make usage: Show this message make build: Build the Docker image to package ABI. + make ci-generate-schedulers: Generate the scheduler files for the CI/CD + make ci-run-scheduler scheduler=: Run a specific scheduler + endef export usage_str @@ -120,4 +123,15 @@ windows-install-conda: build: build.linux.x86_64 build.linux.x86_64: - docker build . -t abi -f Dockerfile.linux.x86_64 --platform linux/amd64 \ No newline at end of file + docker build . -t abi -f Dockerfile.linux.x86_64 --platform linux/amd64 + +# CI/CD +ci-generate-schedulers: + @ conda run -p .abi-conda python .github/scripts/generate_schedulers.py + +ci-run-scheduler: + @ conda run -p .abi-conda python .github/scripts/run_scheduler.py $(scheduler) +# Validations + +validate-config: + @ conda run -p .abi-conda python .github/scripts/validate_jsonschema_yaml.py config.schema.json config.yml \ No newline at end of file diff --git a/config.yml b/config.yml index 92cf23a..4fc4295 100644 --- a/config.yml +++ b/config.yml @@ -4,17 +4,24 @@ config: schedulers: - name: main - enable: true - cron: 0 10 * * * + enabled: true + cron: '*/5 * * * *' steps: - - type: Notebook - enable: true - entrypoint: __pipeline__.ipynb + - type: notebook + name: Run pipeline + enabled: true + entrypoint: tests/ci.ipynb environment_variables: - HELLO: WORLD + CUSTOM_ENV_VAR: Hello World! inputs: notifications: false steps: ["opendata", "content", "growth", "sales", "ops", "finance"] - - type: Notebook - enable: false - entrypoint: ./models/Hubspot.ipynb + message: "Hello scheduler!" + - type: notebook + name: Run Second step + enabled: true + entrypoint: tests/ci.ipynb + inputs: + notifications: false + steps: ["opendata", "content", "growth", "sales", "ops", "finance"] + message: "Hello scheduler!" \ No newline at end of file diff --git a/tests/ci.ipynb b/tests/ci.ipynb index cc5ee88..f133f20 100644 --- a/tests/ci.ipynb +++ b/tests/ci.ipynb @@ -15,7 +15,30 @@ "metadata": {}, "outputs": [], "source": [ - "print('It works!')" + "import os" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "parameters" + ] + }, + "outputs": [], + "source": [ + "message = 'It works!'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(message)\n", + "print(os.environ.get('CUSTOM_ENV_VAR', 'No custom env var found'))" ] } ],