Add ClassificationMetrics to sample pipeline

opendatahub-io · Mar 19, 2024 · 4befe3b
1 parent e2be121
commit 4befe3b
Show file tree

Hide file tree

Showing 4 changed files with 123 additions and 56 deletions.
diff --git a/config/internal/apiserver/sample-pipeline/sample-pipeline.yaml.tmpl b/config/internal/apiserver/sample-pipeline/sample-pipeline.yaml.tmpl
@@ -14,6 +14,8 @@ data:
       # Inputs:
       #    neighbors: int [Default: 3.0]
       #    standard_scaler: bool [Default: True]
+      # Outputs:
+      #    train-model-metrics: system.ClassificationMetrics
       components:
         comp-create-dataset:
           executorLabel: exec-create-dataset
@@ -53,6 +55,10 @@ data:
                 parameterType: NUMBER_INTEGER
           outputDefinitions:
             artifacts:
+              metrics:
+                artifactType:
+                  schemaTitle: system.ClassificationMetrics
+                  schemaVersion: 0.0.1
               model:
                 artifactType:
                   schemaTitle: system.Model
@@ -63,7 +69,7 @@ data:
             container:
               args:
               - --executor_input
-              - '{{$}}'
+              - '{{"{{"}}${{"}}"}}'
               - --function_to_execute
               - create_dataset
               command:
@@ -96,7 +102,7 @@ data:
             container:
               args:
               - --executor_input
-              - '{{$}}'
+              - '{{"{{"}}${{"}}"}}'
               - --function_to_execute
               - normalize_dataset
               command:
@@ -133,7 +139,7 @@ data:
             container:
               args:
               - --executor_input
-              - '{{$}}'
+              - '{{"{{"}}${{"}}"}}'
               - --function_to_execute
               - train_model
               command:
@@ -157,19 +163,33 @@ data:
                 '
               - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
                 \ *\n\ndef train_model(\n    normalized_iris_dataset: Input[Dataset],\n\
-                \    model: Output[Model],\n    n_neighbors: int,\n):\n    import pickle\n\
-                \n    import pandas as pd\n    from sklearn.model_selection import train_test_split\n\
-                \    from sklearn.neighbors import KNeighborsClassifier\n\n    with open(normalized_iris_dataset.path)\
+                \    model: Output[Model],\n    metrics: Output[ClassificationMetrics],\n\
+                \    n_neighbors: int,\n):\n    import pickle\n\n    import pandas as pd\n\
+                \    from sklearn.model_selection import train_test_split\n    from sklearn.neighbors\
+                \ import KNeighborsClassifier\n\n    from sklearn.metrics import roc_curve\n\
+                \    from sklearn.model_selection import train_test_split, cross_val_predict\n\
+                \    from sklearn.metrics import confusion_matrix\n\n\n    with open(normalized_iris_dataset.path)\
                 \ as f:\n        df = pd.read_csv(f)\n\n    y = df.pop('Labels')\n    X\
                 \ = df\n\n    X_train, X_test, y_train, y_test = train_test_split(X, y,\
                 \ random_state=0)\n\n    clf = KNeighborsClassifier(n_neighbors=n_neighbors)\n\
-                \    clf.fit(X_train, y_train)\n\n    model.metadata['framework'] = 'scikit-learn'\n\
-                \    with open(model.path, 'wb') as f:\n        pickle.dump(clf, f)\n\n"
+                \    clf.fit(X_train, y_train)\n\n    predictions = cross_val_predict(\n\
+                \        clf, X_train, y_train, cv=3)\n    metrics.log_confusion_matrix(\n\
+                \        ['Iris-Setosa', 'Iris-Versicolour', 'Iris-Virginica'],\n      \
+                \  confusion_matrix(\n            y_train,\n            predictions).tolist()\
+                \  # .tolist() to convert np array to list.\n    )\n\n    model.metadata['framework']\
+                \ = 'scikit-learn'\n    with open(model.path, 'wb') as f:\n        pickle.dump(clf,\
+                \ f)\n\n"
               image: quay.io/opendatahub/ds-pipelines-sample-base:v1.0
       pipelineInfo:
         name: iris-training-pipeline
       root:
         dag:
+          outputs:
+            artifacts:
+              train-model-metrics:
+                artifactSelectors:
+                - outputArtifactKey: metrics
+                  producerSubtask: train-model
           tasks:
             create-dataset:
               cachingOptions:
@@ -225,6 +245,12 @@ data:
               defaultValue: true
               isOptional: true
               parameterType: BOOLEAN
+        outputDefinitions:
+          artifacts:
+            train-model-metrics:
+              artifactType:
+                schemaTitle: system.ClassificationMetrics
+                schemaVersion: 0.0.1
       schemaVersion: 2.1.0
       sdkVersion: kfp-2.7.0
 {{ else }}

diff --git a/controllers/testdata/declarative/case_7/expected/created/sample-pipeline.yaml.tmpl b/controllers/testdata/declarative/case_7/expected/created/sample-pipeline.yaml.tmpl
@@ -11,9 +11,8 @@ data:
       # PIPELINE DEFINITION
       # Name: iris-training-pipeline
       # Inputs:
-      #    min_max_scaler: bool
-      #    neighbors: int
-      #    standard_scaler: bool
+      #    neighbors: int [Default: 3.0]
+      #    standard_scaler: bool [Default: True]
       # Outputs:
       #    train-model-metrics: system.ClassificationMetrics
       components:
@@ -34,8 +33,6 @@ data:
                   schemaTitle: system.Dataset
                   schemaVersion: 0.0.1
             parameters:
-              min_max_scaler:
-                parameterType: BOOLEAN
               standard_scaler:
                 parameterType: BOOLEAN
           outputDefinitions:
@@ -79,15 +76,18 @@ data:
               - -c
               - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
                 \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
-                \ python3 -m pip install --quiet     --no-warn-script-location 'kfp==2.0.1'\
+                \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.7.0'\
+                \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"'  &&\
+                \  python3 -m pip install --quiet --no-warn-script-location 'pandas==2.2.0'\
                 \ && \"$0\" \"$@\"\n"
               - sh
               - -ec
               - 'program_path=$(mktemp -d)
 
+
                 printf "%s" "$0" > "$program_path/ephemeral_component.py"
 
-                python3 -m kfp.components.executor_main                         --component_module_path                         "$program_path/ephemeral_component.py"                         "$@"
+                _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main                         --component_module_path                         "$program_path/ephemeral_component.py"                         "$@"
 
                 '
               - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
@@ -96,7 +96,7 @@ data:
                 \    col_names = [\n        'Sepal_Length', 'Sepal_Width', 'Petal_Length',\
                 \ 'Petal_Width', 'Labels'\n    ]\n    df = pd.read_csv(csv_url, names=col_names)\n\
                 \n    with open(iris_dataset.path, 'w') as f:\n        df.to_csv(f)\n\n"
-              image: quay.io/hukhan/iris-base:1
+              image: quay.io/opendatahub/ds-pipelines-sample-base:v1.0
           exec-normalize-dataset:
             container:
               args:
@@ -109,32 +109,31 @@ data:
               - -c
               - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
                 \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
-                \ python3 -m pip install --quiet     --no-warn-script-location 'kfp==2.0.1'\
-                \ && \"$0\" \"$@\"\n"
+                \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.7.0'\
+                \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"'  &&\
+                \  python3 -m pip install --quiet --no-warn-script-location 'pandas==2.2.0'\
+                \ 'scikit-learn==1.4.0' && \"$0\" \"$@\"\n"
               - sh
               - -ec
               - 'program_path=$(mktemp -d)
 
+
                 printf "%s" "$0" > "$program_path/ephemeral_component.py"
 
-                python3 -m kfp.components.executor_main                         --component_module_path                         "$program_path/ephemeral_component.py"                         "$@"
+                _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main                         --component_module_path                         "$program_path/ephemeral_component.py"                         "$@"
 
                 '
               - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
                 \ *\n\ndef normalize_dataset(\n    input_iris_dataset: Input[Dataset],\n\
                 \    normalized_iris_dataset: Output[Dataset],\n    standard_scaler: bool,\n\
-                \    min_max_scaler: bool,\n):\n    if standard_scaler is min_max_scaler:\n\
-                \        raise ValueError(\n            'Exactly one of standard_scaler\
-                \ or min_max_scaler must be True.')\n\n    import pandas as pd\n    from\
-                \ sklearn.preprocessing import MinMaxScaler\n    from sklearn.preprocessing\
-                \ import StandardScaler\n\n    with open(input_iris_dataset.path) as f:\n\
-                \        df = pd.read_csv(f)\n    labels = df.pop('Labels')\n\n    if standard_scaler:\n\
-                \        scaler = StandardScaler()\n    if min_max_scaler:\n        scaler\
-                \ = MinMaxScaler()\n\n    df = pd.DataFrame(scaler.fit_transform(df))\n\
-                \    df['Labels'] = labels\n    normalized_iris_dataset.metadata['state']\
-                \ = \"Normalized\"\n    with open(normalized_iris_dataset.path, 'w') as\
-                \ f:\n        df.to_csv(f)\n\n"
-              image: quay.io/hukhan/iris-base:1
+                ):\n\n    import pandas as pd\n    from sklearn.preprocessing import MinMaxScaler\n\
+                \    from sklearn.preprocessing import StandardScaler\n\n    with open(input_iris_dataset.path)\
+                \ as f:\n        df = pd.read_csv(f)\n    labels = df.pop('Labels')\n\n\
+                \    scaler = StandardScaler() if standard_scaler else MinMaxScaler()\n\n\
+                \    df = pd.DataFrame(scaler.fit_transform(df))\n    df['Labels'] = labels\n\
+                \    normalized_iris_dataset.metadata['state'] = \"Normalized\"\n    with\
+                \ open(normalized_iris_dataset.path, 'w') as f:\n        df.to_csv(f)\n\n"
+              image: quay.io/opendatahub/ds-pipelines-sample-base:v1.0
           exec-train-model:
             container:
               args:
@@ -147,35 +146,39 @@ data:
               - -c
               - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
                 \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
-                \ python3 -m pip install --quiet     --no-warn-script-location 'kfp==2.0.1'\
-                \ && \"$0\" \"$@\"\n"
+                \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.7.0'\
+                \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"'  &&\
+                \  python3 -m pip install --quiet --no-warn-script-location 'pandas==2.2.0'\
+                \ 'scikit-learn==1.4.0' && \"$0\" \"$@\"\n"
               - sh
               - -ec
               - 'program_path=$(mktemp -d)
 
+
                 printf "%s" "$0" > "$program_path/ephemeral_component.py"
 
-                python3 -m kfp.components.executor_main                         --component_module_path                         "$program_path/ephemeral_component.py"                         "$@"
+                _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main                         --component_module_path                         "$program_path/ephemeral_component.py"                         "$@"
 
                 '
               - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
                 \ *\n\ndef train_model(\n    normalized_iris_dataset: Input[Dataset],\n\
                 \    model: Output[Model],\n    metrics: Output[ClassificationMetrics],\n\
                 \    n_neighbors: int,\n):\n    import pickle\n\n    import pandas as pd\n\
-                \    from sklearn.neighbors import KNeighborsClassifier\n\n    from sklearn.metrics\
-                \ import roc_curve\n    from sklearn.model_selection import train_test_split,\
-                \ cross_val_predict\n    from sklearn.metrics import confusion_matrix\n\n\
-                \n    with open(normalized_iris_dataset.path) as f:\n        df = pd.read_csv(f)\n\
-                \n    y = df.pop('Labels')\n    X = df\n\n    X_train, X_test, y_train,\
-                \ y_test = train_test_split(X, y, random_state=0)\n\n    clf = KNeighborsClassifier(n_neighbors=n_neighbors)\n\
+                \    from sklearn.model_selection import train_test_split\n    from sklearn.neighbors\
+                \ import KNeighborsClassifier\n\n    from sklearn.metrics import roc_curve\n\
+                \    from sklearn.model_selection import train_test_split, cross_val_predict\n\
+                \    from sklearn.metrics import confusion_matrix\n\n\n    with open(normalized_iris_dataset.path)\
+                \ as f:\n        df = pd.read_csv(f)\n\n    y = df.pop('Labels')\n    X\
+                \ = df\n\n    X_train, X_test, y_train, y_test = train_test_split(X, y,\
+                \ random_state=0)\n\n    clf = KNeighborsClassifier(n_neighbors=n_neighbors)\n\
                 \    clf.fit(X_train, y_train)\n\n    predictions = cross_val_predict(\n\
                 \        clf, X_train, y_train, cv=3)\n    metrics.log_confusion_matrix(\n\
                 \        ['Iris-Setosa', 'Iris-Versicolour', 'Iris-Virginica'],\n      \
                 \  confusion_matrix(\n            y_train,\n            predictions).tolist()\
                 \  # .tolist() to convert np array to list.\n    )\n\n    model.metadata['framework']\
                 \ = 'scikit-learn'\n    with open(model.path, 'wb') as f:\n        pickle.dump(clf,\
                 \ f)\n\n"
-              image: quay.io/hukhan/iris-base:1
+              image: quay.io/opendatahub/ds-pipelines-sample-base:v1.0
       pipelineInfo:
         name: iris-training-pipeline
       root:
@@ -208,9 +211,6 @@ data:
                       outputArtifactKey: iris_dataset
                       producerTask: create-dataset
                 parameters:
-                  min_max_scaler:
-                    runtimeValue:
-                      constant: false
                   standard_scaler:
                     runtimeValue:
                       constant: true
@@ -236,14 +236,13 @@ data:
                 name: train-model
         inputDefinitions:
           parameters:
-            min_max_scaler:
-              defaultValue: true
-              parameterType: BOOLEAN
             neighbors:
-              defaultValue: 3
+              defaultValue: 3.0
+              isOptional: true
               parameterType: NUMBER_INTEGER
             standard_scaler:
-              defaultValue: false
+              defaultValue: true
+              isOptional: true
               parameterType: BOOLEAN
         outputDefinitions:
           artifacts:
@@ -252,4 +251,4 @@ data:
                 schemaTitle: system.ClassificationMetrics
                 schemaVersion: 0.0.1
       schemaVersion: 2.1.0
-      sdkVersion: kfp-2.0.1
+      sdkVersion: kfp-2.7.0
diff --git a/docs/example_pipelines/iris/iris-pipeline.py b/docs/example_pipelines/iris/iris-pipeline.py
@@ -7,6 +7,7 @@
 from kfp.dsl import Input
 from kfp.dsl import Model
 from kfp.dsl import Output
+from kfp.dsl import ClassificationMetrics
 
 
 @dsl.component(
@@ -59,6 +60,7 @@ def normalize_dataset(
 def train_model(
     normalized_iris_dataset: Input[Dataset],
     model: Output[Model],
+    metrics: Output[ClassificationMetrics],
     n_neighbors: int,
 ):
     import pickle
@@ -67,6 +69,11 @@ def train_model(
     from sklearn.model_selection import train_test_split
     from sklearn.neighbors import KNeighborsClassifier
 
+    from sklearn.metrics import roc_curve
+    from sklearn.model_selection import train_test_split, cross_val_predict
+    from sklearn.metrics import confusion_matrix
+
+
     with open(normalized_iris_dataset.path) as f:
         df = pd.read_csv(f)
 
@@ -78,6 +85,15 @@ def train_model(
     clf = KNeighborsClassifier(n_neighbors=n_neighbors)
     clf.fit(X_train, y_train)
 
+    predictions = cross_val_predict(
+        clf, X_train, y_train, cv=3)
+    metrics.log_confusion_matrix(
+        ['Iris-Setosa', 'Iris-Versicolour', 'Iris-Virginica'],
+        confusion_matrix(
+            y_train,
+            predictions).tolist()  # .tolist() to convert np array to list.
+    )
+
     model.metadata['framework'] = 'scikit-learn'
     with open(model.path, 'wb') as f:
         pickle.dump(clf, f)
@@ -86,7 +102,7 @@ def train_model(
 @dsl.pipeline(name='iris-training-pipeline')
 def my_pipeline(
     standard_scaler: bool = True,
-    neighbors: int = 3
+    neighbors: int = 3,
 ):
     create_dataset_task = create_dataset()