From 3a313a0267cb5b69885b247e2a73b5e776332327 Mon Sep 17 00:00:00 2001
From: Giulio Frasca <gfrasca@redhat.com>
Date: Tue, 19 Mar 2024 14:01:17 -0400
Subject: [PATCH] Simplify parameters in Iris example pipeline

---
 .../sample-pipeline/sample-pipeline.yaml.tmpl | 99 ++++++-------------
 docs/example_pipelines/iris/iris-pipeline.py  | 13 +--
 .../example_pipelines/iris/iris-pipeline.yaml | 58 ++++-------
 3 files changed, 50 insertions(+), 120 deletions(-)

diff --git a/config/internal/apiserver/sample-pipeline/sample-pipeline.yaml.tmpl b/config/internal/apiserver/sample-pipeline/sample-pipeline.yaml.tmpl
index c4e9f1508..3fee8d3a0 100644
--- a/config/internal/apiserver/sample-pipeline/sample-pipeline.yaml.tmpl
+++ b/config/internal/apiserver/sample-pipeline/sample-pipeline.yaml.tmpl
@@ -12,11 +12,8 @@ data:
       # PIPELINE DEFINITION
       # Name: iris-training-pipeline
       # Inputs:
-      #    min_max_scaler: bool
       #    neighbors: int
       #    standard_scaler: bool
-      # Outputs:
-      #    train-model-metrics: system.ClassificationMetrics
       components:
         comp-create-dataset:
           executorLabel: exec-create-dataset
@@ -35,8 +32,6 @@ data:
                   schemaTitle: system.Dataset
                   schemaVersion: 0.0.1
             parameters:
-              min_max_scaler:
-                parameterType: BOOLEAN
               standard_scaler:
                 parameterType: BOOLEAN
           outputDefinitions:
@@ -58,10 +53,6 @@ data:
                 parameterType: NUMBER_INTEGER
           outputDefinitions:
             artifacts:
-              metrics:
-                artifactType:
-                  schemaTitle: system.ClassificationMetrics
-                  schemaVersion: 0.0.1
               model:
                 artifactType:
                   schemaTitle: system.Model
@@ -72,7 +63,7 @@ data:
             container:
               args:
               - --executor_input
-              - '{{"{{"}}${{"}}"}}'
+              - '{{"{{"}}${{"}}"}}'
               - --function_to_execute
               - create_dataset
               command:
@@ -80,8 +71,8 @@ data:
               - -c
               - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
                 \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
-                \ python3 -m pip install --quiet     --no-warn-script-location 'kfp==2.0.1'\
-                \ && \"$0\" \"$@\"\n"
+                \ python3 -m pip install --quiet     --no-warn-script-location 'pandas==2.2.0'\
+                \ 'kfp==2.0.0-beta.13' && \"$0\" \"$@\"\n"
               - sh
               - -ec
               - 'program_path=$(mktemp -d)
@@ -97,12 +88,12 @@ data:
                 \    col_names = [\n        'Sepal_Length', 'Sepal_Width', 'Petal_Length',\
                 \ 'Petal_Width', 'Labels'\n    ]\n    df = pd.read_csv(csv_url, names=col_names)\n\
                 \n    with open(iris_dataset.path, 'w') as f:\n        df.to_csv(f)\n\n"
-              image: quay.io/hukhan/iris-base:1
+              image: quay.io/opendatahub/ds-pipelines-sample-base:v1.0
           exec-normalize-dataset:
             container:
               args:
               - --executor_input
-              - '{{"{{"}}${{"}}"}}'
+              - '{{"{{"}}${{"}}"}}'
               - --function_to_execute
               - normalize_dataset
               command:
@@ -110,8 +101,8 @@ data:
               - -c
               - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
                 \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
-                \ python3 -m pip install --quiet     --no-warn-script-location 'kfp==2.0.1'\
-                \ && \"$0\" \"$@\"\n"
+                \ python3 -m pip install --quiet     --no-warn-script-location 'pandas==2.2.0'\
+                \ 'scikit-learn==1.4.0' 'kfp==2.0.0-beta.13' && \"$0\" \"$@\"\n"
               - sh
               - -ec
               - 'program_path=$(mktemp -d)
@@ -124,23 +115,19 @@ data:
               - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
                 \ *\n\ndef normalize_dataset(\n    input_iris_dataset: Input[Dataset],\n\
                 \    normalized_iris_dataset: Output[Dataset],\n    standard_scaler: bool,\n\
-                \    min_max_scaler: bool,\n):\n    if standard_scaler is min_max_scaler:\n\
-                \        raise ValueError(\n            'Exactly one of standard_scaler\
-                \ or min_max_scaler must be True.')\n\n    import pandas as pd\n    from\
-                \ sklearn.preprocessing import MinMaxScaler\n    from sklearn.preprocessing\
-                \ import StandardScaler\n\n    with open(input_iris_dataset.path) as f:\n\
-                \        df = pd.read_csv(f)\n    labels = df.pop('Labels')\n\n    if standard_scaler:\n\
-                \        scaler = StandardScaler()\n    if min_max_scaler:\n        scaler\
-                \ = MinMaxScaler()\n\n    df = pd.DataFrame(scaler.fit_transform(df))\n\
-                \    df['Labels'] = labels\n    normalized_iris_dataset.metadata['state']\
-                \ = \"Normalized\"\n    with open(normalized_iris_dataset.path, 'w') as\
-                \ f:\n        df.to_csv(f)\n\n"
-              image: quay.io/hukhan/iris-base:1
+                ):\n\n    import pandas as pd\n    from sklearn.preprocessing import MinMaxScaler\n\
+                \    from sklearn.preprocessing import StandardScaler\n\n    with open(input_iris_dataset.path)\
+                \ as f:\n        df = pd.read_csv(f)\n    labels = df.pop('Labels')\n\n\
+                \    scaler = StandardScaler() if standard_scaler else MinMaxScaler()\n\n\
+                \    df = pd.DataFrame(scaler.fit_transform(df))\n    df['Labels'] = labels\n\
+                \    normalized_iris_dataset.metadata['state'] = \"Normalized\"\n    with\
+                \ open(normalized_iris_dataset.path, 'w') as f:\n        df.to_csv(f)\n\n"
+              image: quay.io/opendatahub/ds-pipelines-sample-base:v1.0
           exec-train-model:
             container:
               args:
               - --executor_input
-              - '{{"{{"}}${{"}}"}}'
+              - '{{"{{"}}${{"}}"}}'
               - --function_to_execute
               - train_model
               command:
@@ -148,8 +135,8 @@ data:
               - -c
               - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
                 \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
-                \ python3 -m pip install --quiet     --no-warn-script-location 'kfp==2.0.1'\
-                \ && \"$0\" \"$@\"\n"
+                \ python3 -m pip install --quiet     --no-warn-script-location 'pandas==2.2.0'\
+                \ 'scikit-learn==1.4.0' 'kfp==2.0.0-beta.13' && \"$0\" \"$@\"\n"
               - sh
               - -ec
               - 'program_path=$(mktemp -d)
@@ -161,32 +148,19 @@ data:
                 '
               - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
                 \ *\n\ndef train_model(\n    normalized_iris_dataset: Input[Dataset],\n\
-                \    model: Output[Model],\n    metrics: Output[ClassificationMetrics],\n\
-                \    n_neighbors: int,\n):\n    import pickle\n\n    import pandas as pd\n\
-                \    from sklearn.neighbors import KNeighborsClassifier\n\n    from sklearn.metrics\
-                \ import roc_curve\n    from sklearn.model_selection import train_test_split,\
-                \ cross_val_predict\n    from sklearn.metrics import confusion_matrix\n\n\
-                \n    with open(normalized_iris_dataset.path) as f:\n        df = pd.read_csv(f)\n\
-                \n    y = df.pop('Labels')\n    X = df\n\n    X_train, X_test, y_train,\
-                \ y_test = train_test_split(X, y, random_state=0)\n\n    clf = KNeighborsClassifier(n_neighbors=n_neighbors)\n\
-                \    clf.fit(X_train, y_train)\n\n    predictions = cross_val_predict(\n\
-                \        clf, X_train, y_train, cv=3)\n    metrics.log_confusion_matrix(\n\
-                \        ['Iris-Setosa', 'Iris-Versicolour', 'Iris-Virginica'],\n      \
-                \  confusion_matrix(\n            y_train,\n            predictions).tolist()\
-                \  # .tolist() to convert np array to list.\n    )\n\n    model.metadata['framework']\
-                \ = 'scikit-learn'\n    with open(model.path, 'wb') as f:\n        pickle.dump(clf,\
-                \ f)\n\n"
-              image: quay.io/hukhan/iris-base:1
+                \    model: Output[Model],\n    n_neighbors: int,\n):\n    import pickle\n\
+                \n    import pandas as pd\n    from sklearn.model_selection import train_test_split\n\
+                \    from sklearn.neighbors import KNeighborsClassifier\n\n    with open(normalized_iris_dataset.path)\
+                \ as f:\n        df = pd.read_csv(f)\n\n    y = df.pop('Labels')\n    X\
+                \ = df\n\n    X_train, X_test, y_train, y_test = train_test_split(X, y,\
+                \ random_state=0)\n\n    clf = KNeighborsClassifier(n_neighbors=n_neighbors)\n\
+                \    clf.fit(X_train, y_train)\n\n    model.metadata['framework'] = 'scikit-learn'\n\
+                \    with open(model.path, 'wb') as f:\n        pickle.dump(clf, f)\n\n"
+              image: quay.io/opendatahub/ds-pipelines-sample-base:v1.0
       pipelineInfo:
         name: iris-training-pipeline
       root:
         dag:
-          outputs:
-            artifacts:
-              train-model-metrics:
-                artifactSelectors:
-                - outputArtifactKey: metrics
-                  producerSubtask: train-model
           tasks:
             create-dataset:
               cachingOptions:
@@ -209,12 +183,9 @@ data:
                       outputArtifactKey: iris_dataset
                       producerTask: create-dataset
                 parameters:
-                  min_max_scaler:
-                    runtimeValue:
-                      constant: false
                   standard_scaler:
                     runtimeValue:
-                      constant: true
+                      constant: true
               taskInfo:
                 name: normalize-dataset
             train-model:
@@ -237,23 +208,13 @@ data:
                 name: train-model
         inputDefinitions:
           parameters:
-            min_max_scaler:
-              defaultValue: true
-              parameterType: BOOLEAN
             neighbors:
-              defaultValue: 3
               parameterType: NUMBER_INTEGER
             standard_scaler:
-              defaultValue: false
               parameterType: BOOLEAN
-        outputDefinitions:
-          artifacts:
-            train-model-metrics:
-              artifactType:
-                schemaTitle: system.ClassificationMetrics
-                schemaVersion: 0.0.1
       schemaVersion: 2.1.0
-      sdkVersion: kfp-2.0.1
+      sdkVersion: kfp-2.0.0-beta.13
+
 {{ else }}
 apiVersion: v1
 kind: ConfigMap
diff --git a/docs/example_pipelines/iris/iris-pipeline.py b/docs/example_pipelines/iris/iris-pipeline.py
index 2122c25e2..04165b5dd 100644
--- a/docs/example_pipelines/iris/iris-pipeline.py
+++ b/docs/example_pipelines/iris/iris-pipeline.py
@@ -33,11 +33,7 @@ def normalize_dataset(
     input_iris_dataset: Input[Dataset],
     normalized_iris_dataset: Output[Dataset],
     standard_scaler: bool,
-    min_max_scaler: bool,
 ):
-    if standard_scaler is min_max_scaler:
-        raise ValueError(
-            'Exactly one of standard_scaler or min_max_scaler must be True.')
 
     import pandas as pd
     from sklearn.preprocessing import MinMaxScaler
@@ -47,10 +43,7 @@ def normalize_dataset(
         df = pd.read_csv(f)
     labels = df.pop('Labels')
 
-    if standard_scaler:
-        scaler = StandardScaler()
-    if min_max_scaler:
-        scaler = MinMaxScaler()
+    scaler = StandardScaler() if standard_scaler else MinMaxScaler()
 
     df = pd.DataFrame(scaler.fit_transform(df))
     df['Labels'] = labels
@@ -93,15 +86,13 @@ def train_model(
 @dsl.pipeline(name='iris-training-pipeline')
 def my_pipeline(
     standard_scaler: bool,
-    min_max_scaler: bool,
     neighbors: int,
 ):
     create_dataset_task = create_dataset()
 
     normalize_dataset_task = normalize_dataset(
         input_iris_dataset=create_dataset_task.outputs['iris_dataset'],
-        standard_scaler=True,
-        min_max_scaler=False)
+        standard_scaler=True)
 
     train_model(
         normalized_iris_dataset=normalize_dataset_task
diff --git a/docs/example_pipelines/iris/iris-pipeline.yaml b/docs/example_pipelines/iris/iris-pipeline.yaml
index 9246b391f..937255d4e 100644
--- a/docs/example_pipelines/iris/iris-pipeline.yaml
+++ b/docs/example_pipelines/iris/iris-pipeline.yaml
@@ -1,7 +1,6 @@
 # PIPELINE DEFINITION
 # Name: iris-training-pipeline
 # Inputs:
-#    min_max_scaler: bool
 #    neighbors: int
 #    standard_scaler: bool
 components:
@@ -22,8 +21,6 @@ components:
             schemaTitle: system.Dataset
             schemaVersion: 0.0.1
       parameters:
-        min_max_scaler:
-          parameterType: BOOLEAN
         standard_scaler:
           parameterType: BOOLEAN
     outputDefinitions:
@@ -63,18 +60,15 @@ deploymentSpec:
         - -c
         - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
           \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
-          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.7.0'\
-          \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"'  &&\
-          \  python3 -m pip install --quiet --no-warn-script-location 'pandas==2.2.0'\
-          \ && \"$0\" \"$@\"\n"
+          \ python3 -m pip install --quiet     --no-warn-script-location 'pandas==2.2.0'\
+          \ 'kfp==2.0.0-beta.13' && \"$0\" \"$@\"\n"
         - sh
         - -ec
         - 'program_path=$(mktemp -d)
 
-
           printf "%s" "$0" > "$program_path/ephemeral_component.py"
 
-          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main                         --component_module_path                         "$program_path/ephemeral_component.py"                         "$@"
+          python3 -m kfp.components.executor_main                         --component_module_path                         "$program_path/ephemeral_component.py"                         "$@"
 
           '
         - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
@@ -96,34 +90,27 @@ deploymentSpec:
         - -c
         - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
           \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
-          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.7.0'\
-          \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"'  &&\
-          \  python3 -m pip install --quiet --no-warn-script-location 'pandas==2.2.0'\
-          \ 'scikit-learn==1.4.0' && \"$0\" \"$@\"\n"
+          \ python3 -m pip install --quiet     --no-warn-script-location 'pandas==2.2.0'\
+          \ 'scikit-learn==1.4.0' 'kfp==2.0.0-beta.13' && \"$0\" \"$@\"\n"
         - sh
         - -ec
         - 'program_path=$(mktemp -d)
 
-
           printf "%s" "$0" > "$program_path/ephemeral_component.py"
 
-          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main                         --component_module_path                         "$program_path/ephemeral_component.py"                         "$@"
+          python3 -m kfp.components.executor_main                         --component_module_path                         "$program_path/ephemeral_component.py"                         "$@"
 
           '
         - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
           \ *\n\ndef normalize_dataset(\n    input_iris_dataset: Input[Dataset],\n\
           \    normalized_iris_dataset: Output[Dataset],\n    standard_scaler: bool,\n\
-          \    min_max_scaler: bool,\n):\n    if standard_scaler is min_max_scaler:\n\
-          \        raise ValueError(\n            'Exactly one of standard_scaler\
-          \ or min_max_scaler must be True.')\n\n    import pandas as pd\n    from\
-          \ sklearn.preprocessing import MinMaxScaler\n    from sklearn.preprocessing\
-          \ import StandardScaler\n\n    with open(input_iris_dataset.path) as f:\n\
-          \        df = pd.read_csv(f)\n    labels = df.pop('Labels')\n\n    if standard_scaler:\n\
-          \        scaler = StandardScaler()\n    if min_max_scaler:\n        scaler\
-          \ = MinMaxScaler()\n\n    df = pd.DataFrame(scaler.fit_transform(df))\n\
-          \    df['Labels'] = labels\n    normalized_iris_dataset.metadata['state']\
-          \ = \"Normalized\"\n    with open(normalized_iris_dataset.path, 'w') as\
-          \ f:\n        df.to_csv(f)\n\n"
+          ):\n\n    import pandas as pd\n    from sklearn.preprocessing import MinMaxScaler\n\
+          \    from sklearn.preprocessing import StandardScaler\n\n    with open(input_iris_dataset.path)\
+          \ as f:\n        df = pd.read_csv(f)\n    labels = df.pop('Labels')\n\n\
+          \    scaler = StandardScaler() if standard_scaler else MinMaxScaler()\n\n\
+          \    df = pd.DataFrame(scaler.fit_transform(df))\n    df['Labels'] = labels\n\
+          \    normalized_iris_dataset.metadata['state'] = \"Normalized\"\n    with\
+          \ open(normalized_iris_dataset.path, 'w') as f:\n        df.to_csv(f)\n\n"
         image: quay.io/opendatahub/ds-pipelines-sample-base:v1.0
     exec-train-model:
       container:
@@ -137,18 +124,15 @@ deploymentSpec:
         - -c
         - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
           \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
-          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.7.0'\
-          \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"'  &&\
-          \  python3 -m pip install --quiet --no-warn-script-location 'pandas==2.2.0'\
-          \ 'scikit-learn==1.4.0' && \"$0\" \"$@\"\n"
+          \ python3 -m pip install --quiet     --no-warn-script-location 'pandas==2.2.0'\
+          \ 'scikit-learn==1.4.0' 'kfp==2.0.0-beta.13' && \"$0\" \"$@\"\n"
         - sh
         - -ec
         - 'program_path=$(mktemp -d)
 
-
           printf "%s" "$0" > "$program_path/ephemeral_component.py"
 
-          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main                         --component_module_path                         "$program_path/ephemeral_component.py"                         "$@"
+          python3 -m kfp.components.executor_main                         --component_module_path                         "$program_path/ephemeral_component.py"                         "$@"
 
           '
         - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
@@ -161,7 +145,6 @@ deploymentSpec:
           \ random_state=0)\n\n    clf = KNeighborsClassifier(n_neighbors=n_neighbors)\n\
           \    clf.fit(X_train, y_train)\n\n    model.metadata['framework'] = 'scikit-learn'\n\
           \    with open(model.path, 'wb') as f:\n        pickle.dump(clf, f)\n\n"
-
         image: quay.io/opendatahub/ds-pipelines-sample-base:v1.0
 pipelineInfo:
   name: iris-training-pipeline
@@ -189,12 +172,9 @@ root:
                 outputArtifactKey: iris_dataset
                 producerTask: create-dataset
           parameters:
-            min_max_scaler:
-              runtimeValue:
-                constant: false
             standard_scaler:
               runtimeValue:
-                constant: true
+                constant: true
         taskInfo:
           name: normalize-dataset
       train-model:
@@ -217,11 +197,9 @@ root:
           name: train-model
   inputDefinitions:
     parameters:
-      min_max_scaler:
-        parameterType: BOOLEAN
       neighbors:
         parameterType: NUMBER_INTEGER
       standard_scaler:
         parameterType: BOOLEAN
 schemaVersion: 2.1.0
-sdkVersion: kfp-2.7.0
+sdkVersion: kfp-2.0.0-beta.13