tests: cicd: reworked e2e tests to support multiple cases and added config/test for grouping
Lasica committed Nov 21, 2023
1 parent 6bbb464 commit ded9fc7
Showing 9 changed files with 157 additions and 7 deletions.
22 changes: 16 additions & 6 deletions .github/workflows/test_and_publish.yml
@@ -99,6 +99,9 @@ jobs:
e2e_tests:
runs-on: ubuntu-latest
needs: [unit_tests, sonarcloud]
+ strategy:
+   matrix:
+     e2e_case: ["standard", "grouping"]
steps:
- uses: actions/checkout@v4

@@ -120,7 +123,7 @@ jobs:
# kedro 0.18.1 is on purpose here, due to https://github.com/kedro-org/kedro-starters/issues/99
run: |
pip install $(find "./dist" -name "*.tar.gz")
- kedro new --starter spaceflights --config tests/e2e/starter-config.yml --verbose
+ kedro new --starter spaceflights --config tests/e2e/${{ matrix.e2e_case }}/starter-config.yml --verbose
- name: Install project dependencies
working-directory: ./spaceflights
@@ -139,8 +142,13 @@
sed -i 's/\(COPY src\/requirements.txt.*\)$/\1\nCOPY kedro-vertexai.tar.gz ./g' Dockerfile
echo "!data/01_raw" >> .dockerignore
kedro vertexai init gid-ml-ops-sandbox europe-west4
- mv ../tests/e2e/catalog.yml conf/base/catalog.yml
- mv ../tests/e2e/vertexai.yml conf/base/vertexai.yml
+ cp ../tests/e2e/${{ matrix.e2e_case }}/catalog.yml conf/base/catalog.yml
+ cp ../tests/e2e/${{ matrix.e2e_case }}/vertexai.yml conf/base/vertexai.yml
+ # Introducing tagging to pipelines
+ if [[ "${{ matrix.e2e_case }}" == "grouping" ]]; then
+   mv ../tests/e2e/${{ matrix.e2e_case }}/pipeline_data_processing.py src/spaceflights/pipelines/data_processing/pipeline.py
+   mv ../tests/e2e/${{ matrix.e2e_case }}/pipeline_data_science.py src/spaceflights/pipelines/data_science/pipeline.py
+ fi
- name: Prepare docker env
uses: docker/setup-buildx-action@v3
@@ -151,14 +159,15 @@
- name: Build pipeline docker image
run: |
cd ./spaceflights
- docker build --build-arg BASE_IMAGE=python:3.8-buster --tag kedro-vertexai-e2e:latest --load .
+ docker pull gcr.io/gid-ml-ops-sandbox/kedro-vertexai-e2e:${{ matrix.e2e_case }} || true
+ docker build --build-arg BASE_IMAGE=python:3.10-buster --tag kedro-vertexai-e2e:${{ matrix.e2e_case }} --load --cache-from=gcr.io/gid-ml-ops-sandbox/kedro-vertexai-e2e:${{ matrix.e2e_case }} .
- name: Publish docker image to GCR
uses: mattes/gce-docker-push-action@v1
with:
creds: ${{ secrets.GOOGLE_APPLICATION_CREDENTIALS }}
- src: kedro-vertexai-e2e:latest
- dst: gcr.io/gid-ml-ops-sandbox/kedro-vertexai-e2e:latest
+ src: kedro-vertexai-e2e:${{ matrix.e2e_case }}
+ dst: gcr.io/gid-ml-ops-sandbox/kedro-vertexai-e2e:${{ matrix.e2e_case }}

- name: Set up GCP Credentials
uses: google-github-actions/[email protected]
@@ -172,6 +181,7 @@
cd ./spaceflights
export KEDRO_CONFIG_COMMIT_ID=$GITHUB_SHA
kedro vertexai run-once --wait-for-completion
publish:
if: github.event.pull_request == null && github.ref == 'refs/heads/master'
needs: [ e2e_tests, codeql ]
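The workflow now runs the e2e job once per matrix case ("standard" and "grouping"), and every case-specific fixture is looked up under tests/e2e/<case>/ instead of tests/e2e/. The pipeline image is tagged and pushed per case, and the previously pushed image for that case is pulled as a --cache-from source, so repeated runs of the same case can reuse build layers. As a sketch only (not part of the commit; the helper name is hypothetical), the per-case layout introduced here can be summarised like this:

from pathlib import Path

E2E_ROOT = Path("tests/e2e")

def case_fixtures(e2e_case: str) -> dict[str, Path]:
    """Illustrative helper: the files the workflow copies for a given matrix case."""
    case_dir = E2E_ROOT / e2e_case  # tests/e2e/standard or tests/e2e/grouping
    files = {
        "starter_config": case_dir / "starter-config.yml",
        "catalog": case_dir / "catalog.yml",
        "vertexai": case_dir / "vertexai.yml",
    }
    if e2e_case == "grouping":
        # The grouping case also swaps tagged pipeline definitions into the starter project.
        files["pipeline_data_processing"] = case_dir / "pipeline_data_processing.py"
        files["pipeline_data_science"] = case_dir / "pipeline_data_science.py"
    return files

for name, path in case_fixtures("grouping").items():
    print(f"{name}: {path}")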
File renamed without changes.
35 changes: 35 additions & 0 deletions tests/e2e/grouping/pipeline_data_processing.py
@@ -0,0 +1,35 @@
from kedro.pipeline import Pipeline, node, pipeline

from .nodes import (
create_model_input_table,
preprocess_companies,
preprocess_shuttles,
)


def create_pipeline(**kwargs) -> Pipeline:
return pipeline(
[
node(
func=preprocess_companies,
inputs="companies",
outputs="preprocessed_companies",
name="preprocess_companies_node",
tags=["grp:preprocessing"],
),
node(
func=preprocess_shuttles,
inputs="shuttles",
outputs="preprocessed_shuttles",
name="preprocess_shuttles_node",
tags=["grp:preprocessing"],
),
node(
func=create_model_input_table,
inputs=["preprocessed_shuttles", "preprocessed_companies", "reviews"],
outputs="model_input_table",
name="create_model_input_table_node",
tags=["grp:preprocessing"],
),
]
)
29 changes: 29 additions & 0 deletions tests/e2e/grouping/pipeline_data_science.py
@@ -0,0 +1,29 @@
from kedro.pipeline import Pipeline, node, pipeline

from .nodes import evaluate_model, split_data, train_model


def create_pipeline(**kwargs) -> Pipeline:
return pipeline(
[
node(
func=split_data,
inputs=["model_input_table", "params:model_options"],
outputs=["X_train", "X_test", "y_train", "y_test"],
name="split_data_node",
tags=["grp:preprocessing"],
),
node(
func=train_model,
inputs=["X_train", "y_train"],
outputs="regressor",
name="train_model_node",
),
node(
func=evaluate_model,
inputs=["regressor", "X_test", "y_test"],
outputs=None,
name="evaluate_model_node",
),
]
)
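In the grouping case, the three data-processing nodes and split_data_node are tagged "grp:preprocessing", while train_model_node and evaluate_model_node carry no grouping tag. Together with the TagNodeGrouper configuration in tests/e2e/grouping/vertexai.yml (tag_prefix: "grp:"), the intent is that the tagged nodes are collapsed into one grouped Vertex AI step while the untagged nodes keep running as individual steps. The snippet below is only an illustration of the tag-prefix idea, not the plugin's implementation:

from collections import defaultdict

def group_by_tag_prefix(node_tags: dict[str, set[str]], tag_prefix: str = "grp:") -> dict[str, list[str]]:
    """Group node names by their first tag starting with tag_prefix; untagged nodes stand alone."""
    groups: dict[str, list[str]] = defaultdict(list)
    for node_name, tags in node_tags.items():
        prefixed = sorted(tag for tag in tags if tag.startswith(tag_prefix))
        group_name = prefixed[0][len(tag_prefix):] if prefixed else node_name
        groups[group_name].append(node_name)
    return dict(groups)

# Tags exactly as declared in the two pipeline files above.
example = {
    "preprocess_companies_node": {"grp:preprocessing"},
    "preprocess_shuttles_node": {"grp:preprocessing"},
    "create_model_input_table_node": {"grp:preprocessing"},
    "split_data_node": {"grp:preprocessing"},
    "train_model_node": set(),
    "evaluate_model_node": set(),
}
print(group_by_tag_prefix(example))
# {'preprocessing': [<four tagged nodes>], 'train_model_node': [...], 'evaluate_model_node': [...]}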
3 changes: 3 additions & 0 deletions tests/e2e/grouping/starter-config.yml
@@ -0,0 +1,3 @@
project_name: Spaceflights VertexAI E2E Test With Grouping
repo_name: spaceflights
python_package: spaceflights
57 changes: 57 additions & 0 deletions tests/e2e/grouping/vertexai.yml
@@ -0,0 +1,57 @@
project_id: gid-ml-ops-sandbox
region: europe-west4
run_config:
# Name of the image to run as the pipeline steps
image: gcr.io/gid-ml-ops-sandbox/kedro-vertexai-e2e:grouping

# Pull policy to be used for the steps. Use Always if you push the images
# on the same tag, or Never if you use only local images
image_pull_policy: IfNotPresent

# Location of Vertex AI GCS root
root: gid-ml-ops-sandbox-plugin-tests/staging

# Name of the kubeflow experiment to be created
experiment_name: kedro-vertex-e2e-grouping

# Name of the scheduled run, templated with the schedule parameters
scheduled_run_name: kedro-vertex-e2e-grouping

# Optional service account to run vertex AI Pipeline with
service_account: [email protected]

# Optional pipeline description
# description: "Very Important Pipeline"
grouping:
cls: kedro_vertexai.grouping.TagNodeGrouper
params:
tag_prefix: "grp:"

# How long to keep underlying Argo workflow (together with pods and data
# volume after pipeline finishes) [in seconds]. Default: 1 week
ttl: 604800

# Optional network configuration
# network:

# Name of the vpc to use for running Vertex Pipeline
# vpc: my-vpc

# Hosts aliases to be placed in /etc/hosts when pipeline is executed
# host_aliases:
# - ip: 127.0.0.1
# hostnames: me.local

# What Kedro pipeline should be run as the last step regardless of the
# pipeline status. Used to send notifications or raise the alerts
# on_exit_pipeline: notify_via_slack

# Optional section allowing adjustment of the resources, reservations and limits
# for the nodes. When not provided they're set to 500m cpu and 1024Mi memory.
# If you don't want to specify pipeline resources set both to None in __default__.
resources:

# Default settings for the nodes
__default__:
cpu: 500m
memory: 1024Mi
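The grouping block above selects the grouper implementation by dotted path (cls: kedro_vertexai.grouping.TagNodeGrouper) and hands it params (tag_prefix: "grp:"). Below is a generic, hedged sketch of resolving such a cls/params pair from configuration; this is a common dynamic-import pattern and an assumption about usage, not necessarily how kedro-vertexai loads the class:

import importlib
from typing import Any

def instantiate_from_config(grouping_cfg: dict[str, Any]) -> Any:
    """Resolve a dotted class path from "cls" and construct it with "params" as kwargs (sketch)."""
    module_path, _, class_name = grouping_cfg["cls"].rpartition(".")
    cls = getattr(importlib.import_module(module_path), class_name)
    return cls(**grouping_cfg.get("params", {}))

# Mirrors run_config.grouping from the YAML above.
grouping_cfg = {
    "cls": "kedro_vertexai.grouping.TagNodeGrouper",
    "params": {"tag_prefix": "grp:"},
}
# grouper = instantiate_from_config(grouping_cfg)  # requires kedro-vertexai to be installed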
16 changes: 16 additions & 0 deletions tests/e2e/standard/catalog.yml
@@ -0,0 +1,16 @@
companies:
type: pandas.CSVDataSet
filepath: data/01_raw/companies.csv
layer: raw

reviews:
type: pandas.CSVDataSet
filepath: data/01_raw/reviews.csv
layer: raw

shuttles:
type: pandas.ExcelDataSet
filepath: data/01_raw/shuttles.xlsx
layer: raw
load_args:
engine: openpyxl
File renamed without changes.
2 changes: 1 addition & 1 deletion tests/e2e/vertexai.yml → tests/e2e/standard/vertexai.yml
@@ -2,7 +2,7 @@ project_id: gid-ml-ops-sandbox
region: europe-west4
run_config:
# Name of the image to run as the pipeline steps
- image: gcr.io/gid-ml-ops-sandbox/kedro-vertexai-e2e:latest
+ image: gcr.io/gid-ml-ops-sandbox/kedro-vertexai-e2e:standard

# Pull policy to be used for the steps. Use Always if you push the images
# on the same tag, or Never if you use only local images
