From 06d375eba8c79058ea7f1da6ebfc0758973c891f Mon Sep 17 00:00:00 2001 From: Yaron Haviv Date: Wed, 29 Jul 2020 16:10:52 +0300 Subject: [PATCH] add describe step --- gitops_project.ipynb | 419 +++++++++++++++++++++++++++++++++++++------ project.yaml | 10 +- workflow.py | 7 +- 3 files changed, 375 insertions(+), 61 deletions(-) diff --git a/gitops_project.ipynb b/gitops_project.ipynb index 10298d8..110f379 100644 --- a/gitops_project.ipynb +++ b/gitops_project.ipynb @@ -166,25 +166,23 @@ "name": "stdout", "output_type": "stream", "text": [ - "[mlrun] 2020-06-10 12:28:42,497 starting run iris_gen uid=40270e2513c14a8996949a8e8a5d4a7e -> http://mlrun-api:8080\n", - "[mlrun] 2020-06-10 12:28:42,533 saving iris dataframe to /User/demo-github-actions/data\n", - "[mlrun] 2020-06-10 12:28:42,594 log artifact iris_dataset at /User/demo-github-actions/data/iris_dataset.csv, size: 2776, db: Y\n", - "\n" + "> 2020-07-29 10:38:35,433 [info] starting run iris_gen uid=3e340d3561ca402c91e9bb09b1631dd4 -> http://mlrun-api:8080\n", + "> 2020-07-29 10:38:35,518 [info] saving iris dataframe to /User/demo-github-actions/data\n" ] }, { "data": { "text/html": [ - "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
gitops-project0Jul 29 12:46:57completeddescribe-summarize
v3io_user=admin
kind=job
owner=admin
host=describe-summarize-r9tvz
table
label_column=label
histograms
violin
imbalance
imbalance-weights-vec
correlation-matrix
correlation
\n", + "
\n", + "
\n", + "
\n", + " Title\n", + " ×\n", + "
\n", + " \n", + "
\n", + "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "to track results use .show() or .logs() or in CLI: \n", + "!mlrun get run 301ab10adbf34adb898f0751c7f0f0b4 --project gitops-project , !mlrun logs 301ab10adbf34adb898f0751c7f0f0b4 --project gitops-project\n", + "> 2020-07-29 12:47:11,671 [info] run executed, status=completed\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "skproj.func('describe').apply(mount_v3io()).run(params={'label_column': 'label'}, \n", + " inputs={\"table\": gen.outputs['iris_dataset']}, \n", + " artifact_path=artifact_path)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -436,7 +728,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 9, @@ -476,7 +768,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -514,7 +806,6 @@ ")\n", "def kfpipeline():\n", " \n", - " \n", " # run the ingestion function with the new image and params\n", " ingest = funcs['gen-iris'].as_step(\n", " name=\"get-data\",\n", @@ -522,6 +813,12 @@ " params={'format': 'pq'},\n", " outputs=[DATASET])\n", "\n", + " # analyze our dataset\n", + " describe = funcs[\"describe\"].as_step(\n", + " name=\"summary\",\n", + " params={\"label_column\": LABELS},\n", + " inputs={\"table\": ingest.outputs[DATASET]})\n", + " \n", " # train with hyper-paremeters\n", " train = funcs[\"train\"].as_step(\n", " name=\"train\",\n", @@ -555,7 +852,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -572,11 +869,11 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ - "skproj.artifact_path = 'v3io:///users/admin/pipe/{{workflow.uid}}'\n", + "skproj.artifact_path = 'v3io:///users/{{run.user}}/pipe/{{workflow.uid}}'\n", "skproj.save()" ] }, @@ -593,7 +890,7 @@ "The workflow ID is returned and can be used to track the progress or you can use the hyperlinks\n", "\n", "> Note: The same command can be issued through CLI commands:
\n", - " `mlrun project my-proj/ -r main -p \"v3io:///users/admin/mlrun/kfp/{{workflow.uid}}/\"`\n", + " `mlrun project my-proj/ -r main -p \"v3io:///users/{{run.user}}/mlrun/kfp/{{workflow.uid}}/\"`\n", "\n", "The `dirty` flag allow us to run a project with uncommited changes (when the notebook is in the same git dir it will always be dirty)
\n", "The `watch` flag will wait for the pipeline to complete" @@ -601,7 +898,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ @@ -611,13 +908,13 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Experiment link here" + "Experiment link here" ], "text/plain": [ "" @@ -629,7 +926,7 @@ { "data": { "text/html": [ - "Run link here" + "Run link here" ], "text/plain": [ "" @@ -642,14 +939,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "[mlrun] 2020-06-10 12:29:40,557 Pipeline run id=7a60c6c1-b765-4702-919e-29ceec31dc20, check UI or DB for progress\n", - "[mlrun] 2020-06-10 12:29:40,558 waiting for pipeline run completion\n" + "> 2020-07-29 13:04:18,155 [info] Pipeline run id=8f462295-2154-428a-b861-4ec8be504832, check UI or DB for progress\n", + "> 2020-07-29 13:04:18,156 [info] waiting for pipeline run completion\n" ] }, { "data": { "text/html": [ - "

Run Results

Workflow 7a60c6c1-b765-4702-919e-29ceec31dc20 finished, status=Succeeded
click the hyper links below to see detailed results
\n", + "

Run Results

Workflow 8f462295-2154-428a-b861-4ec8be504832 finished, status=Succeeded
click the hyper links below to see detailed results
\n", " \n", " \n", " \n", @@ -662,36 +959,44 @@ " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", "
uid
Jun 10 12:30:29Jul 29 13:05:03completedmodel-tester
total_tests=15
errors=0
match=14
avg_latency=12694
min_latency=11730
max_latency=16535
latency
total_tests=15
errors=0
match=14
avg_latency=11446
min_latency=11047
max_latency=12131
latency
Jun 10 12:30:16Jul 29 13:04:54completedtest
rocauc=0.46440904774238106
avg_precscore=0.40055555555555555
accuracy=0.9333333333333333
f1_score=0.9333333333333333
roc
confusion
featimp
featimp-tbl
test_set_preds
accuracy=0.9333333333333333
test-error=0.06666666666666667
auc-micro=0.9655555555555556
auc-weighted=0.9888888888888889
f1-score=0.9137254901960784
precision_score=0.8888888888888888
recall_score=0.9629629629629629
confusion-matrix
feature-importances
precision-recall-multiclass
roc-multiclass
test_set_preds
Jul 29 13:04:37completedsummary
histograms
violin
imbalance
imbalance-weights-vec
correlation-matrix
correlation
Jun 10 12:30:00Jul 29 13:04:36completedtrain
best_iteration=1
rocauc=0.9945117845117846
accuracy=0.9705882352941176
f1_score=0.9705882352941176
test_set
roc
confusion
model
iteration_results
best_iteration=1
accuracy=0.9705882352941176
test-error=0.029411764705882353
auc-micro=0.9969723183391004
auc-weighted=0.9949732620320856
f1-score=0.9679633867276888
precision_score=0.9666666666666667
recall_score=0.9722222222222222
test_set
confusion-matrix
feature-importances
precision-recall-multiclass
roc-multiclass
model
iteration_results
Jun 10 12:29:50Jul 29 13:04:26completedget-data
iris_dataset
iris_dataset
" @@ -734,7 +1039,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.7.6" } }, "nbformat": 4, diff --git a/project.yaml b/project.yaml index 2702742..14cf899 100644 --- a/project.yaml +++ b/project.yaml @@ -19,15 +19,17 @@ functions: doc: '' parameters: - name: context + default: '' - name: format default: csv - outputs: [] + outputs: + - default: '' lineno: 11 description: '' build: functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IG9zCmZyb20gc2tsZWFybi5kYXRhc2V0cyBpbXBvcnQgbG9hZF9pcmlzCmZyb20gc2tsZWFybi5tb2RlbF9zZWxlY3Rpb24gaW1wb3J0IHRyYWluX3Rlc3Rfc3BsaXQKaW1wb3J0IG51bXB5IGFzIG5wCmZyb20gc2tsZWFybi5tZXRyaWNzIGltcG9ydCBhY2N1cmFjeV9zY29yZQpmcm9tIG1scnVuLmFydGlmYWN0cyBpbXBvcnQgVGFibGVBcnRpZmFjdCwgUGxvdEFydGlmYWN0CmltcG9ydCBwYW5kYXMgYXMgcGQKCmRlZiBpcmlzX2dlbmVyYXRvcihjb250ZXh0LCBmb3JtYXQ9J2NzdicpOgogICAgaXJpcyA9IGxvYWRfaXJpcygpCiAgICBpcmlzX2RhdGFzZXQgPSBwZC5EYXRhRnJhbWUoZGF0YT1pcmlzLmRhdGEsIGNvbHVtbnM9aXJpcy5mZWF0dXJlX25hbWVzKQogICAgaXJpc19sYWJlbHMgPSBwZC5EYXRhRnJhbWUoZGF0YT1pcmlzLnRhcmdldCwgY29sdW1ucz1bJ2xhYmVsJ10pCiAgICBpcmlzX2RhdGFzZXQgPSBwZC5jb25jYXQoW2lyaXNfZGF0YXNldCwgaXJpc19sYWJlbHNdLCBheGlzPTEpCiAgICAKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oJ3NhdmluZyBpcmlzIGRhdGFmcmFtZSB0byB7fScuZm9ybWF0KGNvbnRleHQuYXJ0aWZhY3RfcGF0aCkpCiAgICBjb250ZXh0LmxvZ19kYXRhc2V0KCdpcmlzX2RhdGFzZXQnLCBkZj1pcmlzX2RhdGFzZXQsIGZvcm1hdD1mb3JtYXQsIGluZGV4PUZhbHNlKQoK commands: [] - code_origin: https://github.com/mlrun/demo-github-actions.git#3395573d8f1c7ad4725314afb3d067751bbea465:gen_iris.ipynb + code_origin: https://github.com/mlrun/demo-github-actions.git#0e717588b1354d3d60cd96ba5c352d71aace0552 - url: hub://sklearn_classifier name: train - url: hub://test_classifier @@ -36,8 +38,10 @@ functions: name: serving - url: hub://model_server_tester name: live_tester +- url: hub://describe + name: describe workflows: - name: main path: workflow.py artifacts: [] -artifact_path: v3io:///users/admin/pipe/{{workflow.uid}} +artifact_path: v3io:///users/{{run.user}}/pipe/{{workflow.uid}} diff --git a/workflow.py b/workflow.py index 91606c5..23961c8 100644 --- a/workflow.py +++ b/workflow.py @@ -23,7 +23,6 @@ def init_functions(functions: dict, project=None, secrets=None): ) def kfpipeline(): - # run the ingestion function with the new image and params ingest = funcs['gen-iris'].as_step( name="get-data", @@ -31,6 +30,12 @@ def kfpipeline(): params={'format': 'pq'}, outputs=[DATASET]) + # analyze our dataset + describe = funcs["describe"].as_step( + name="summary", + params={"label_column": LABELS}, + inputs={"table": ingest.outputs[DATASET]}) + # train with hyper-paremeters train = funcs["train"].as_step( name="train",