From 3a313a0267cb5b69885b247e2a73b5e776332327 Mon Sep 17 00:00:00 2001 From: Giulio Frasca Date: Tue, 19 Mar 2024 14:01:17 -0400 Subject: [PATCH] Simplify parameters in Iris example pipeline --- .../sample-pipeline/sample-pipeline.yaml.tmpl | 99 ++++++------------- docs/example_pipelines/iris/iris-pipeline.py | 13 +-- .../example_pipelines/iris/iris-pipeline.yaml | 58 ++++------- 3 files changed, 50 insertions(+), 120 deletions(-) diff --git a/config/internal/apiserver/sample-pipeline/sample-pipeline.yaml.tmpl b/config/internal/apiserver/sample-pipeline/sample-pipeline.yaml.tmpl index c4e9f1508..3fee8d3a0 100644 --- a/config/internal/apiserver/sample-pipeline/sample-pipeline.yaml.tmpl +++ b/config/internal/apiserver/sample-pipeline/sample-pipeline.yaml.tmpl @@ -12,11 +12,8 @@ data: # PIPELINE DEFINITION # Name: iris-training-pipeline # Inputs: - # min_max_scaler: bool # neighbors: int # standard_scaler: bool - # Outputs: - # train-model-metrics: system.ClassificationMetrics components: comp-create-dataset: executorLabel: exec-create-dataset @@ -35,8 +32,6 @@ data: schemaTitle: system.Dataset schemaVersion: 0.0.1 parameters: - min_max_scaler: - parameterType: BOOLEAN standard_scaler: parameterType: BOOLEAN outputDefinitions: @@ -58,10 +53,6 @@ data: parameterType: NUMBER_INTEGER outputDefinitions: artifacts: - metrics: - artifactType: - schemaTitle: system.ClassificationMetrics - schemaVersion: 0.0.1 model: artifactType: schemaTitle: system.Model @@ -72,7 +63,7 @@ data: container: args: - --executor_input - - '{{"{{"}}${{"}}"}}' + - '{{$}}' - --function_to_execute - create_dataset command: @@ -80,8 +71,8 @@ data: - -c - "\nif ! 
[ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ - \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.0.1'\ - \ && \"$0\" \"$@\"\n" + \ python3 -m pip install --quiet --no-warn-script-location 'pandas==2.2.0'\ + \ 'kfp==2.0.0-beta.13' && \"$0\" \"$@\"\n" - sh - -ec - 'program_path=$(mktemp -d) @@ -97,12 +88,12 @@ data: \ col_names = [\n 'Sepal_Length', 'Sepal_Width', 'Petal_Length',\ \ 'Petal_Width', 'Labels'\n ]\n df = pd.read_csv(csv_url, names=col_names)\n\ \n with open(iris_dataset.path, 'w') as f:\n df.to_csv(f)\n\n" - image: quay.io/hukhan/iris-base:1 + image: quay.io/opendatahub/ds-pipelines-sample-base:v1.0 exec-normalize-dataset: container: args: - --executor_input - - '{{"{{"}}${{"}}"}}' + - '{{$}}' - --function_to_execute - normalize_dataset command: @@ -110,8 +101,8 @@ data: - -c - "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ - \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.0.1'\ - \ && \"$0\" \"$@\"\n" + \ python3 -m pip install --quiet --no-warn-script-location 'pandas==2.2.0'\ + \ 'scikit-learn==1.4.0' 'kfp==2.0.0-beta.13' && \"$0\" \"$@\"\n" - sh - -ec - 'program_path=$(mktemp -d) @@ -124,23 +115,19 @@ data: - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ \ *\n\ndef normalize_dataset(\n input_iris_dataset: Input[Dataset],\n\ \ normalized_iris_dataset: Output[Dataset],\n standard_scaler: bool,\n\ - \ min_max_scaler: bool,\n):\n if standard_scaler is min_max_scaler:\n\ - \ raise ValueError(\n 'Exactly one of standard_scaler\ - \ or min_max_scaler must be True.')\n\n import pandas as pd\n from\ - \ sklearn.preprocessing import MinMaxScaler\n from sklearn.preprocessing\ - \ import StandardScaler\n\n with open(input_iris_dataset.path) as f:\n\ - \ df 
= pd.read_csv(f)\n labels = df.pop('Labels')\n\n if standard_scaler:\n\ - \ scaler = StandardScaler()\n if min_max_scaler:\n scaler\ - \ = MinMaxScaler()\n\n df = pd.DataFrame(scaler.fit_transform(df))\n\ - \ df['Labels'] = labels\n normalized_iris_dataset.metadata['state']\ - \ = \"Normalized\"\n with open(normalized_iris_dataset.path, 'w') as\ - \ f:\n df.to_csv(f)\n\n" - image: quay.io/hukhan/iris-base:1 + ):\n\n import pandas as pd\n from sklearn.preprocessing import MinMaxScaler\n\ + \ from sklearn.preprocessing import StandardScaler\n\n with open(input_iris_dataset.path)\ + \ as f:\n df = pd.read_csv(f)\n labels = df.pop('Labels')\n\n\ + \ scaler = StandardScaler() if standard_scaler else MinMaxScaler()\n\n\ + \ df = pd.DataFrame(scaler.fit_transform(df))\n df['Labels'] = labels\n\ + \ normalized_iris_dataset.metadata['state'] = \"Normalized\"\n with\ + \ open(normalized_iris_dataset.path, 'w') as f:\n df.to_csv(f)\n\n" + image: quay.io/opendatahub/ds-pipelines-sample-base:v1.0 exec-train-model: container: args: - --executor_input - - '{{"{{"}}${{"}}"}}' + - '{{$}}' - --function_to_execute - train_model command: @@ -148,8 +135,8 @@ data: - -c - "\nif ! 
[ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ - \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.0.1'\ - \ && \"$0\" \"$@\"\n" + \ python3 -m pip install --quiet --no-warn-script-location 'pandas==2.2.0'\ + \ 'scikit-learn==1.4.0' 'kfp==2.0.0-beta.13' && \"$0\" \"$@\"\n" - sh - -ec - 'program_path=$(mktemp -d) @@ -161,32 +148,19 @@ data: ' - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ \ *\n\ndef train_model(\n normalized_iris_dataset: Input[Dataset],\n\ - \ model: Output[Model],\n metrics: Output[ClassificationMetrics],\n\ - \ n_neighbors: int,\n):\n import pickle\n\n import pandas as pd\n\ - \ from sklearn.neighbors import KNeighborsClassifier\n\n from sklearn.metrics\ - \ import roc_curve\n from sklearn.model_selection import train_test_split,\ - \ cross_val_predict\n from sklearn.metrics import confusion_matrix\n\n\ - \n with open(normalized_iris_dataset.path) as f:\n df = pd.read_csv(f)\n\ - \n y = df.pop('Labels')\n X = df\n\n X_train, X_test, y_train,\ - \ y_test = train_test_split(X, y, random_state=0)\n\n clf = KNeighborsClassifier(n_neighbors=n_neighbors)\n\ - \ clf.fit(X_train, y_train)\n\n predictions = cross_val_predict(\n\ - \ clf, X_train, y_train, cv=3)\n metrics.log_confusion_matrix(\n\ - \ ['Iris-Setosa', 'Iris-Versicolour', 'Iris-Virginica'],\n \ - \ confusion_matrix(\n y_train,\n predictions).tolist()\ - \ # .tolist() to convert np array to list.\n )\n\n model.metadata['framework']\ - \ = 'scikit-learn'\n with open(model.path, 'wb') as f:\n pickle.dump(clf,\ - \ f)\n\n" - image: quay.io/hukhan/iris-base:1 + \ model: Output[Model],\n n_neighbors: int,\n):\n import pickle\n\ + \n import pandas as pd\n from sklearn.model_selection import train_test_split\n\ + \ from sklearn.neighbors import KNeighborsClassifier\n\n with open(normalized_iris_dataset.path)\ + \ as f:\n df = 
pd.read_csv(f)\n\n y = df.pop('Labels')\n X\ + \ = df\n\n X_train, X_test, y_train, y_test = train_test_split(X, y,\ + \ random_state=0)\n\n clf = KNeighborsClassifier(n_neighbors=n_neighbors)\n\ + \ clf.fit(X_train, y_train)\n\n model.metadata['framework'] = 'scikit-learn'\n\ + \ with open(model.path, 'wb') as f:\n pickle.dump(clf, f)\n\n" + image: quay.io/opendatahub/ds-pipelines-sample-base:v1.0 pipelineInfo: name: iris-training-pipeline root: dag: - outputs: - artifacts: - train-model-metrics: - artifactSelectors: - - outputArtifactKey: metrics - producerSubtask: train-model tasks: create-dataset: cachingOptions: @@ -209,12 +183,9 @@ data: outputArtifactKey: iris_dataset producerTask: create-dataset parameters: - min_max_scaler: - runtimeValue: - constant: false standard_scaler: runtimeValue: - constant: true + constant: 1.0 taskInfo: name: normalize-dataset train-model: @@ -237,23 +208,13 @@ data: name: train-model inputDefinitions: parameters: - min_max_scaler: - defaultValue: true - parameterType: BOOLEAN neighbors: - defaultValue: 3 parameterType: NUMBER_INTEGER standard_scaler: - defaultValue: false parameterType: BOOLEAN - outputDefinitions: - artifacts: - train-model-metrics: - artifactType: - schemaTitle: system.ClassificationMetrics - schemaVersion: 0.0.1 schemaVersion: 2.1.0 - sdkVersion: kfp-2.0.1 + sdkVersion: kfp-2.0.0-beta.13 + {{ else }} apiVersion: v1 kind: ConfigMap diff --git a/docs/example_pipelines/iris/iris-pipeline.py b/docs/example_pipelines/iris/iris-pipeline.py index 2122c25e2..04165b5dd 100644 --- a/docs/example_pipelines/iris/iris-pipeline.py +++ b/docs/example_pipelines/iris/iris-pipeline.py @@ -33,11 +33,7 @@ def normalize_dataset( input_iris_dataset: Input[Dataset], normalized_iris_dataset: Output[Dataset], standard_scaler: bool, - min_max_scaler: bool, ): - if standard_scaler is min_max_scaler: - raise ValueError( - 'Exactly one of standard_scaler or min_max_scaler must be True.') import pandas as pd from sklearn.preprocessing 
import MinMaxScaler @@ -47,10 +43,7 @@ def normalize_dataset( df = pd.read_csv(f) labels = df.pop('Labels') - if standard_scaler: - scaler = StandardScaler() - if min_max_scaler: - scaler = MinMaxScaler() + scaler = StandardScaler() if standard_scaler else MinMaxScaler() df = pd.DataFrame(scaler.fit_transform(df)) df['Labels'] = labels @@ -93,15 +86,13 @@ def train_model( @dsl.pipeline(name='iris-training-pipeline') def my_pipeline( standard_scaler: bool, - min_max_scaler: bool, neighbors: int, ): create_dataset_task = create_dataset() normalize_dataset_task = normalize_dataset( input_iris_dataset=create_dataset_task.outputs['iris_dataset'], - standard_scaler=True, - min_max_scaler=False) + standard_scaler=True) train_model( normalized_iris_dataset=normalize_dataset_task diff --git a/docs/example_pipelines/iris/iris-pipeline.yaml b/docs/example_pipelines/iris/iris-pipeline.yaml index 9246b391f..937255d4e 100644 --- a/docs/example_pipelines/iris/iris-pipeline.yaml +++ b/docs/example_pipelines/iris/iris-pipeline.yaml @@ -1,7 +1,6 @@ # PIPELINE DEFINITION # Name: iris-training-pipeline # Inputs: -# min_max_scaler: bool # neighbors: int # standard_scaler: bool components: @@ -22,8 +21,6 @@ components: schemaTitle: system.Dataset schemaVersion: 0.0.1 parameters: - min_max_scaler: - parameterType: BOOLEAN standard_scaler: parameterType: BOOLEAN outputDefinitions: @@ -63,18 +60,15 @@ deploymentSpec: - -c - "\nif ! 
[ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ - \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.7.0'\ - \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\ - \ python3 -m pip install --quiet --no-warn-script-location 'pandas==2.2.0'\ - \ && \"$0\" \"$@\"\n" + \ python3 -m pip install --quiet --no-warn-script-location 'pandas==2.2.0'\ + \ 'kfp==2.0.0-beta.13' && \"$0\" \"$@\"\n" - sh - -ec - 'program_path=$(mktemp -d) - printf "%s" "$0" > "$program_path/ephemeral_component.py" - _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" + python3 -m kfp.components.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" ' - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ @@ -96,34 +90,27 @@ deploymentSpec: - -c - "\nif ! 
[ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ - \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.7.0'\ - \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\ - \ python3 -m pip install --quiet --no-warn-script-location 'pandas==2.2.0'\ - \ 'scikit-learn==1.4.0' && \"$0\" \"$@\"\n" + \ python3 -m pip install --quiet --no-warn-script-location 'pandas==2.2.0'\ + \ 'scikit-learn==1.4.0' 'kfp==2.0.0-beta.13' && \"$0\" \"$@\"\n" - sh - -ec - 'program_path=$(mktemp -d) - printf "%s" "$0" > "$program_path/ephemeral_component.py" - _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" + python3 -m kfp.components.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" ' - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ \ *\n\ndef normalize_dataset(\n input_iris_dataset: Input[Dataset],\n\ \ normalized_iris_dataset: Output[Dataset],\n standard_scaler: bool,\n\ - \ min_max_scaler: bool,\n):\n if standard_scaler is min_max_scaler:\n\ - \ raise ValueError(\n 'Exactly one of standard_scaler\ - \ or min_max_scaler must be True.')\n\n import pandas as pd\n from\ - \ sklearn.preprocessing import MinMaxScaler\n from sklearn.preprocessing\ - \ import StandardScaler\n\n with open(input_iris_dataset.path) as f:\n\ - \ df = pd.read_csv(f)\n labels = df.pop('Labels')\n\n if standard_scaler:\n\ - \ scaler = StandardScaler()\n if min_max_scaler:\n scaler\ - \ = MinMaxScaler()\n\n df = pd.DataFrame(scaler.fit_transform(df))\n\ - \ df['Labels'] = labels\n normalized_iris_dataset.metadata['state']\ - \ = \"Normalized\"\n with open(normalized_iris_dataset.path, 'w') as\ - \ f:\n df.to_csv(f)\n\n" + ):\n\n import pandas as pd\n from sklearn.preprocessing import MinMaxScaler\n\ + \ from sklearn.preprocessing import 
StandardScaler\n\n with open(input_iris_dataset.path)\ + \ as f:\n df = pd.read_csv(f)\n labels = df.pop('Labels')\n\n\ + \ scaler = StandardScaler() if standard_scaler else MinMaxScaler()\n\n\ + \ df = pd.DataFrame(scaler.fit_transform(df))\n df['Labels'] = labels\n\ + \ normalized_iris_dataset.metadata['state'] = \"Normalized\"\n with\ + \ open(normalized_iris_dataset.path, 'w') as f:\n df.to_csv(f)\n\n" image: quay.io/opendatahub/ds-pipelines-sample-base:v1.0 exec-train-model: container: @@ -137,18 +124,15 @@ deploymentSpec: - -c - "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ - \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.7.0'\ - \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\ - \ python3 -m pip install --quiet --no-warn-script-location 'pandas==2.2.0'\ - \ 'scikit-learn==1.4.0' && \"$0\" \"$@\"\n" + \ python3 -m pip install --quiet --no-warn-script-location 'pandas==2.2.0'\ + \ 'scikit-learn==1.4.0' 'kfp==2.0.0-beta.13' && \"$0\" \"$@\"\n" - sh - -ec - 'program_path=$(mktemp -d) - printf "%s" "$0" > "$program_path/ephemeral_component.py" - _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" + python3 -m kfp.components.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" ' - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ @@ -161,7 +145,6 @@ deploymentSpec: \ random_state=0)\n\n clf = KNeighborsClassifier(n_neighbors=n_neighbors)\n\ \ clf.fit(X_train, y_train)\n\n model.metadata['framework'] = 'scikit-learn'\n\ \ with open(model.path, 'wb') as f:\n pickle.dump(clf, f)\n\n" - image: quay.io/opendatahub/ds-pipelines-sample-base:v1.0 pipelineInfo: name: iris-training-pipeline @@ -189,12 +172,9 @@ root: outputArtifactKey: iris_dataset producerTask: create-dataset 
parameters: - min_max_scaler: - runtimeValue: - constant: false standard_scaler: runtimeValue: - constant: true + constant: 1.0 taskInfo: name: normalize-dataset train-model: @@ -217,11 +197,9 @@ root: name: train-model inputDefinitions: parameters: - min_max_scaler: - parameterType: BOOLEAN neighbors: parameterType: NUMBER_INTEGER standard_scaler: parameterType: BOOLEAN schemaVersion: 2.1.0 -sdkVersion: kfp-2.7.0 +sdkVersion: kfp-2.0.0-beta.13