Skip to content

Commit

Permalink
Add ClassificationMetrics to sample pipeline
Browse files: browse the repository at this point in the history
  • Loading branch information
gmfrasca committed Mar 19, 2024
1 parent e2be121 commit 4befe3b
Show file tree
Hide file tree
Showing 4 changed files with 123 additions and 56 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -14,6 +14,8 @@ data:
# Inputs:
# neighbors: int [Default: 3.0]
# standard_scaler: bool [Default: True]
# Outputs:
# train-model-metrics: system.ClassificationMetrics
components:
comp-create-dataset:
executorLabel: exec-create-dataset
Expand Down Expand Up @@ -53,6 +55,10 @@ data:
parameterType: NUMBER_INTEGER
outputDefinitions:
artifacts:
metrics:
artifactType:
schemaTitle: system.ClassificationMetrics
schemaVersion: 0.0.1
model:
artifactType:
schemaTitle: system.Model
Expand All @@ -63,7 +69,7 @@ data:
container:
args:
- --executor_input
- '{{$}}'
- '{{"{{"}}${{"}}"}}'
- --function_to_execute
- create_dataset
command:
Expand Down Expand Up @@ -96,7 +102,7 @@ data:
container:
args:
- --executor_input
- '{{$}}'
- '{{"{{"}}${{"}}"}}'
- --function_to_execute
- normalize_dataset
command:
Expand Down Expand Up @@ -133,7 +139,7 @@ data:
container:
args:
- --executor_input
- '{{$}}'
- '{{"{{"}}${{"}}"}}'
- --function_to_execute
- train_model
command:
Expand All @@ -157,19 +163,33 @@ data:
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef train_model(\n normalized_iris_dataset: Input[Dataset],\n\
\ model: Output[Model],\n n_neighbors: int,\n):\n import pickle\n\
\n import pandas as pd\n from sklearn.model_selection import train_test_split\n\
\ from sklearn.neighbors import KNeighborsClassifier\n\n with open(normalized_iris_dataset.path)\
\ model: Output[Model],\n metrics: Output[ClassificationMetrics],\n\
\ n_neighbors: int,\n):\n import pickle\n\n import pandas as pd\n\
\ from sklearn.model_selection import train_test_split\n from sklearn.neighbors\
\ import KNeighborsClassifier\n\n from sklearn.metrics import roc_curve\n\
\ from sklearn.model_selection import train_test_split, cross_val_predict\n\
\ from sklearn.metrics import confusion_matrix\n\n\n with open(normalized_iris_dataset.path)\
\ as f:\n df = pd.read_csv(f)\n\n y = df.pop('Labels')\n X\
\ = df\n\n X_train, X_test, y_train, y_test = train_test_split(X, y,\
\ random_state=0)\n\n clf = KNeighborsClassifier(n_neighbors=n_neighbors)\n\
\ clf.fit(X_train, y_train)\n\n model.metadata['framework'] = 'scikit-learn'\n\
\ with open(model.path, 'wb') as f:\n pickle.dump(clf, f)\n\n"
\ clf.fit(X_train, y_train)\n\n predictions = cross_val_predict(\n\
\ clf, X_train, y_train, cv=3)\n metrics.log_confusion_matrix(\n\
\ ['Iris-Setosa', 'Iris-Versicolour', 'Iris-Virginica'],\n \
\ confusion_matrix(\n y_train,\n predictions).tolist()\
\ # .tolist() to convert np array to list.\n )\n\n model.metadata['framework']\
\ = 'scikit-learn'\n with open(model.path, 'wb') as f:\n pickle.dump(clf,\
\ f)\n\n"
image: quay.io/opendatahub/ds-pipelines-sample-base:v1.0
pipelineInfo:
name: iris-training-pipeline
root:
dag:
outputs:
artifacts:
train-model-metrics:
artifactSelectors:
- outputArtifactKey: metrics
producerSubtask: train-model
tasks:
create-dataset:
cachingOptions:
Expand Down Expand Up @@ -225,6 +245,12 @@ data:
defaultValue: true
isOptional: true
parameterType: BOOLEAN
outputDefinitions:
artifacts:
train-model-metrics:
artifactType:
schemaTitle: system.ClassificationMetrics
schemaVersion: 0.0.1
schemaVersion: 2.1.0
sdkVersion: kfp-2.7.0
{{ else }}
Expand Down
Original file line number | Diff line number | Diff line change
Expand Up @@ -11,9 +11,8 @@ data:
# PIPELINE DEFINITION
# Name: iris-training-pipeline
# Inputs:
# min_max_scaler: bool
# neighbors: int
# standard_scaler: bool
# neighbors: int [Default: 3.0]
# standard_scaler: bool [Default: True]
# Outputs:
# train-model-metrics: system.ClassificationMetrics
components:
Expand All @@ -34,8 +33,6 @@ data:
schemaTitle: system.Dataset
schemaVersion: 0.0.1
parameters:
min_max_scaler:
parameterType: BOOLEAN
standard_scaler:
parameterType: BOOLEAN
outputDefinitions:
Expand Down Expand Up @@ -79,15 +76,18 @@ data:
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.0.1'\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.7.0'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
\ python3 -m pip install --quiet --no-warn-script-location 'pandas==2.2.0'\
\ && \"$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)


printf "%s" "$0" > "$program_path/ephemeral_component.py"

python3 -m kfp.components.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"

'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
Expand All @@ -96,7 +96,7 @@ data:
\ col_names = [\n 'Sepal_Length', 'Sepal_Width', 'Petal_Length',\
\ 'Petal_Width', 'Labels'\n ]\n df = pd.read_csv(csv_url, names=col_names)\n\
\n with open(iris_dataset.path, 'w') as f:\n df.to_csv(f)\n\n"
image: quay.io/hukhan/iris-base:1
image: quay.io/opendatahub/ds-pipelines-sample-base:v1.0
exec-normalize-dataset:
container:
args:
Expand All @@ -109,32 +109,31 @@ data:
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.0.1'\
\ && \"$0\" \"$@\"\n"
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.7.0'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
\ python3 -m pip install --quiet --no-warn-script-location 'pandas==2.2.0'\
\ 'scikit-learn==1.4.0' && \"$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)


printf "%s" "$0" > "$program_path/ephemeral_component.py"

python3 -m kfp.components.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"

'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef normalize_dataset(\n input_iris_dataset: Input[Dataset],\n\
\ normalized_iris_dataset: Output[Dataset],\n standard_scaler: bool,\n\
\ min_max_scaler: bool,\n):\n if standard_scaler is min_max_scaler:\n\
\ raise ValueError(\n 'Exactly one of standard_scaler\
\ or min_max_scaler must be True.')\n\n import pandas as pd\n from\
\ sklearn.preprocessing import MinMaxScaler\n from sklearn.preprocessing\
\ import StandardScaler\n\n with open(input_iris_dataset.path) as f:\n\
\ df = pd.read_csv(f)\n labels = df.pop('Labels')\n\n if standard_scaler:\n\
\ scaler = StandardScaler()\n if min_max_scaler:\n scaler\
\ = MinMaxScaler()\n\n df = pd.DataFrame(scaler.fit_transform(df))\n\
\ df['Labels'] = labels\n normalized_iris_dataset.metadata['state']\
\ = \"Normalized\"\n with open(normalized_iris_dataset.path, 'w') as\
\ f:\n df.to_csv(f)\n\n"
image: quay.io/hukhan/iris-base:1
):\n\n import pandas as pd\n from sklearn.preprocessing import MinMaxScaler\n\
\ from sklearn.preprocessing import StandardScaler\n\n with open(input_iris_dataset.path)\
\ as f:\n df = pd.read_csv(f)\n labels = df.pop('Labels')\n\n\
\ scaler = StandardScaler() if standard_scaler else MinMaxScaler()\n\n\
\ df = pd.DataFrame(scaler.fit_transform(df))\n df['Labels'] = labels\n\
\ normalized_iris_dataset.metadata['state'] = \"Normalized\"\n with\
\ open(normalized_iris_dataset.path, 'w') as f:\n df.to_csv(f)\n\n"
image: quay.io/opendatahub/ds-pipelines-sample-base:v1.0
exec-train-model:
container:
args:
Expand All @@ -147,35 +146,39 @@ data:
- -c
- "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\
\ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.0.1'\
\ && \"$0\" \"$@\"\n"
\ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.7.0'\
\ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\
\ python3 -m pip install --quiet --no-warn-script-location 'pandas==2.2.0'\
\ 'scikit-learn==1.4.0' && \"$0\" \"$@\"\n"
- sh
- -ec
- 'program_path=$(mktemp -d)


printf "%s" "$0" > "$program_path/ephemeral_component.py"

python3 -m kfp.components.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"
_KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"

'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef train_model(\n normalized_iris_dataset: Input[Dataset],\n\
\ model: Output[Model],\n metrics: Output[ClassificationMetrics],\n\
\ n_neighbors: int,\n):\n import pickle\n\n import pandas as pd\n\
\ from sklearn.neighbors import KNeighborsClassifier\n\n from sklearn.metrics\
\ import roc_curve\n from sklearn.model_selection import train_test_split,\
\ cross_val_predict\n from sklearn.metrics import confusion_matrix\n\n\
\n with open(normalized_iris_dataset.path) as f:\n df = pd.read_csv(f)\n\
\n y = df.pop('Labels')\n X = df\n\n X_train, X_test, y_train,\
\ y_test = train_test_split(X, y, random_state=0)\n\n clf = KNeighborsClassifier(n_neighbors=n_neighbors)\n\
\ from sklearn.model_selection import train_test_split\n from sklearn.neighbors\
\ import KNeighborsClassifier\n\n from sklearn.metrics import roc_curve\n\
\ from sklearn.model_selection import train_test_split, cross_val_predict\n\
\ from sklearn.metrics import confusion_matrix\n\n\n with open(normalized_iris_dataset.path)\
\ as f:\n df = pd.read_csv(f)\n\n y = df.pop('Labels')\n X\
\ = df\n\n X_train, X_test, y_train, y_test = train_test_split(X, y,\
\ random_state=0)\n\n clf = KNeighborsClassifier(n_neighbors=n_neighbors)\n\
\ clf.fit(X_train, y_train)\n\n predictions = cross_val_predict(\n\
\ clf, X_train, y_train, cv=3)\n metrics.log_confusion_matrix(\n\
\ ['Iris-Setosa', 'Iris-Versicolour', 'Iris-Virginica'],\n \
\ confusion_matrix(\n y_train,\n predictions).tolist()\
\ # .tolist() to convert np array to list.\n )\n\n model.metadata['framework']\
\ = 'scikit-learn'\n with open(model.path, 'wb') as f:\n pickle.dump(clf,\
\ f)\n\n"
image: quay.io/hukhan/iris-base:1
image: quay.io/opendatahub/ds-pipelines-sample-base:v1.0
pipelineInfo:
name: iris-training-pipeline
root:
Expand Down Expand Up @@ -208,9 +211,6 @@ data:
outputArtifactKey: iris_dataset
producerTask: create-dataset
parameters:
min_max_scaler:
runtimeValue:
constant: false
standard_scaler:
runtimeValue:
constant: true
Expand All @@ -236,14 +236,13 @@ data:
name: train-model
inputDefinitions:
parameters:
min_max_scaler:
defaultValue: true
parameterType: BOOLEAN
neighbors:
defaultValue: 3
defaultValue: 3.0
isOptional: true
parameterType: NUMBER_INTEGER
standard_scaler:
defaultValue: false
defaultValue: true
isOptional: true
parameterType: BOOLEAN
outputDefinitions:
artifacts:
Expand All @@ -252,4 +251,4 @@ data:
schemaTitle: system.ClassificationMetrics
schemaVersion: 0.0.1
schemaVersion: 2.1.0
sdkVersion: kfp-2.0.1
sdkVersion: kfp-2.7.0
18 changes: 17 additions & 1 deletion docs/example_pipelines/iris/iris-pipeline.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -7,6 +7,7 @@
from kfp.dsl import Input
from kfp.dsl import Model
from kfp.dsl import Output
from kfp.dsl import ClassificationMetrics


@dsl.component(
Expand Down Expand Up @@ -59,6 +60,7 @@ def normalize_dataset(
def train_model(
normalized_iris_dataset: Input[Dataset],
model: Output[Model],
metrics: Output[ClassificationMetrics],
n_neighbors: int,
):
import pickle
Expand All @@ -67,6 +69,11 @@ def train_model(
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.metrics import confusion_matrix


with open(normalized_iris_dataset.path) as f:
df = pd.read_csv(f)

Expand All @@ -78,6 +85,15 @@ def train_model(
clf = KNeighborsClassifier(n_neighbors=n_neighbors)
clf.fit(X_train, y_train)

predictions = cross_val_predict(
clf, X_train, y_train, cv=3)
metrics.log_confusion_matrix(
['Iris-Setosa', 'Iris-Versicolour', 'Iris-Virginica'],
confusion_matrix(
y_train,
predictions).tolist() # .tolist() to convert np array to list.
)

model.metadata['framework'] = 'scikit-learn'
with open(model.path, 'wb') as f:
pickle.dump(clf, f)
Expand All @@ -86,7 +102,7 @@ def train_model(
@dsl.pipeline(name='iris-training-pipeline')
def my_pipeline(
standard_scaler: bool = True,
neighbors: int = 3
neighbors: int = 3,
):
create_dataset_task = create_dataset()

Expand Down
Loading

0 comments on commit 4befe3b

Please sign in to comment.